# 1. Libraries

In [1]:
import pandas as pd
import numpy as np
import re
import string
import math
from nltk.tokenize import word_tokenize
import tensorflow as tf
import tensorflow.keras.backend as K
from sklearn.model_selection import StratifiedKFold
from transformers import *
import tokenizers
print('TF version', tf.__version__)

TF version 2.6.0


# 2. Main settings

In [2]:
# Cross-validation / training hyper-parameters.
n_splits = 5            # number of StratifiedKFold folds (one model is trained per fold)
EPOCHS = 3              # epochs passed to model.fit for each fold
BATCH_SIZE = 32         # batch size passed to model.fit
MAX_LEN = 96            # fixed width of every input / label array and of the model inputs
PAD_ID = 1              # padding token id — presumably RoBERTa's <pad>; TODO confirm against the vocab
LABEL_SMOOTHING = 0.1   # label smoothing applied inside the crossentropy loss

# Directory holding the RoBERTa vocab, merges, config and pretrained weights.
PATH = '../input/tf-roberta/'
# Byte-level BPE tokenizer; lower-cases its input and adds a prefix space.
tokenizer = tokenizers.ByteLevelBPETokenizer(
    PATH+'vocab-roberta-base.json',
    PATH+'merges-roberta-base.txt',
    lowercase=True,
    add_prefix_space=True
)
# Token id inserted into the model input for each sentiment label.
# NOTE(review): presumably the vocabulary ids of the words themselves — verify against the vocab file.
sentiment_id = {'positive': 1313, 'negative': 2430, 'neutral': 7974}

# 3. Functions

In [3]:
def read_train():
    """
        Read the training CSV.

        Missing cells are replaced by empty strings and the 'text' and
        'selected_text' columns are cast to str.

        Returns:
            pandas.DataFrame with the training rows.
    """
    frame = pd.read_csv('../input/tweet-sentiment-extraction/train.csv')
    frame = frame.fillna('')
    for column in ('text', 'selected_text'):
        frame[column] = frame[column].astype(str)
    return frame


def read_test():
    """
        Read the test CSV.

        Missing cells are replaced by empty strings and the 'text' column
        is cast to str.

        Returns:
            pandas.DataFrame with the test rows.
    """
    frame = pd.read_csv('../input/tweet-sentiment-extraction/test.csv')
    frame = frame.fillna('')
    frame['text'] = frame['text'].astype(str)
    return frame


def read_submission():
    """
        Read the sample submission CSV, replacing missing cells by
        empty strings.

        Returns:
            pandas.DataFrame with the submission template.
    """
    template_path = '../input/tweet-sentiment-extraction/sample_submission.csv'
    template = pd.read_csv(template_path)
    return template.fillna('')


def jaccard_improve(str1, str2):
    """
        Widen a selected span with a few words of surrounding context.

        Both strings are lower-cased and `str2` is located inside `str1`.
        If more words precede the span than follow it, the (up to) three
        words right before it are prepended; otherwise the (up to) two words
        right after it are appended.

        Args:
            str1: full text.
            str2: span expected to occur inside `str1`.

        Returns:
            The lower-cased span joined with the chosen context words by a
            single space. If `str2` does not occur in `str1`, the lower-cased
            span is returned unchanged.
    """
    text = str1.lower()
    span = str2.lower()
    pos = text.find(span)

    # Without this guard, find() == -1 would be used as a slice bound and
    # glue a truncated copy of the tweet onto the span.
    if pos < 0:
        return span

    before = text[:pos].split()
    # Every occurrence of the span from `pos` onwards is removed, not only
    # the first one.
    after = text[pos:].replace(span, '').split()

    if len(before) > len(after):
        return " ".join(before[-3:]) + " " + span
    return span + " " + " ".join(after[0:2])


def loss_fn(y_true, y_pred):
    """
       Label-smoothed categorical crossentropy, averaged over the batch.

       The targets are first cut along their second axis to the length of
       the predictions (sequence bucketing), then compared against
       `y_pred`, which is treated as probabilities rather than logits.
    """
    seq_len = tf.shape(y_pred)[1]
    trimmed_targets = y_true[:, :seq_len]
    per_example = tf.keras.losses.categorical_crossentropy(
        trimmed_targets, y_pred,
        from_logits=False,
        label_smoothing=LABEL_SMOOTHING)
    return tf.reduce_mean(per_example)


def scheduler(epoch):
    """Learning-rate schedule: start at 3e-5 and multiply by 0.2 every epoch."""
    initial_lr = 3e-5
    decay = 0.2
    return initial_lr * decay**epoch


def _span_head(sequence_output):
    """
        Build one span-boundary head on top of the encoder output.

        Dropout -> Conv1D(768) -> LeakyReLU -> Conv1D(64) -> Dense(1),
        flattened to one score per position and normalised with a softmax.
    """
    h = tf.keras.layers.Dropout(0.1)(sequence_output)
    h = tf.keras.layers.Conv1D(768, 2, padding='same')(h)
    h = tf.keras.layers.LeakyReLU()(h)
    h = tf.keras.layers.Conv1D(64, 2, padding='same')(h)
    h = tf.keras.layers.Dense(1)(h)
    h = tf.keras.layers.Flatten()(h)
    return tf.keras.layers.Activation('softmax')(h)


def build_model():
    """
        Build and compile the span-extraction model.

        The backbone is a pretrained RoBERTa model loaded from PATH. Two
        identical convolutional heads are stacked on the first element of
        its output: one for the start position and one for the end position.

        Inputs (each of width MAX_LEN, int32): token ids, attention mask,
        token type ids.

        Returns:
            A Keras model with outputs [start_probs, end_probs], compiled
            with Adam (learning rate 3e-5) and `loss_fn`.
    """
    ids = tf.keras.layers.Input((MAX_LEN,), dtype=tf.int32)
    att = tf.keras.layers.Input((MAX_LEN,), dtype=tf.int32)
    tok = tf.keras.layers.Input((MAX_LEN,), dtype=tf.int32)

    config = RobertaConfig.from_pretrained(PATH+'config-roberta-base.json')
    bert_model = TFRobertaModel.from_pretrained(
        PATH+'pretrained-roberta-base.h5', config=config)

    x = bert_model(ids, attention_mask=att, token_type_ids=tok)

    # The heads are created in a fixed order (start, then end) so that the
    # layer order of the model — and therefore saved weight files — is stable.
    x1 = _span_head(x[0])
    x2 = _span_head(x[0])

    model = tf.keras.models.Model(inputs=[ids, att, tok], outputs=[x1, x2])
    optimizer = tf.keras.optimizers.Adam(learning_rate=3e-5)
    model.compile(loss=loss_fn, optimizer=optimizer)

    return model


def jaccard(str1, str2):
    """
        Word-level Jaccard similarity of two strings.

        Both strings are lower-cased and split on whitespace; the score is
        |A ∩ B| / |A ∪ B| over the resulting word sets. When both word
        sets are empty the score is defined as 0.5.
    """
    words_a = set(str1.lower().split())
    words_b = set(str2.lower().split())
    if not words_a and not words_b:
        return 0.5
    shared = words_a & words_b
    union_size = len(words_a) + len(words_b) - len(shared)
    return float(len(shared)) / union_size

# 4. Load data

In [4]:
# Load the three competition CSVs; the file paths are hard-coded inside the readers.
train = read_train()
test = read_test()
# Submission template, kept as a reference for the expected output format.
submission_df = read_submission()

# 5. Prepare data for Model

As we saw in the previous notebook (BiLSTM), the "neutral" tweets have the least influence on the result, so we train our model only on the "positive" and "negative" tweets.

In [5]:
# Keep only the non-neutral tweets and renumber the rows from 0, so that the
# positional loops below can address rows with .loc[k, ...].
train_df = train[train['sentiment'] != "neutral"].reset_index(drop=True)
test_df = test[test['sentiment'] != "neutral"].reset_index(drop=True)

print(f"non neutral train data: {train_df.shape}")
print(f"non neutral test data: {test_df.shape}")

non neutral train data: (16363, 4)
non neutral test data: (2104, 3)


#### Make templates for Model's inputs

In [6]:
# One row per training tweet, MAX_LEN slots per row.
ct = train_df.shape[0]
template_shape = (ct, MAX_LEN)

# Token ids start out filled with 1; every other array starts at 0.
input_ids = np.ones(template_shape, dtype='int32')
attention_mask = np.zeros(template_shape, dtype='int32')
token_type_ids = np.zeros(template_shape, dtype='int32')
# One-hot targets for the span start / end positions.
start_tokens = np.zeros(template_shape, dtype='int32')
end_tokens = np.zeros(template_shape, dtype='int32')

for template in (input_ids, attention_mask, token_type_ids,
                 start_tokens, end_tokens):
    print(template.shape)

(16363, 96)
(16363, 96)
(16363, 96)
(16363, 96)
(16363, 96)


#### Prepare train data

In [7]:
for k in range(train_df.shape[0]):

    # FIND OVERLAP
    # Whitespace runs are collapsed in both strings; the tweet gets a single
    # leading space before tokenisation.
    text1 = " "+" ".join(train_df.loc[k, 'text'].split())
    text2 = " ".join(train_df.loc[k, 'selected_text'].split())
    idx = text1.find(text2)
    # chars[c] == 1 marks character c of text1 as part of the selected span.
    chars = np.zeros((len(text1)))
    if idx >= 0:
        chars[idx:idx+len(text2)] = 1
        # Also mark the space right before the span. The idx > 0 guard
        # keeps idx-1 from wrapping around to the last character.
        if idx > 0 and text1[idx-1] == ' ':
            chars[idx-1] = 1
    # If the selection is not found (idx == -1) no character is marked and
    # the row keeps all-zero start/end labels.
    enc = tokenizer.encode(text1)

    # ID_OFFSETS
    # Character span of every token, rebuilt by decoding tokens one at a time.
    offsets = []
    cursor = 0
    for t in enc.ids:
        w = tokenizer.decode([t])
        offsets.append((cursor, cursor+len(w)))
        cursor += len(w)

    # START END TOKENS
    # Indices of the tokens that overlap at least one marked character.
    toks = [i for i, (a, b) in enumerate(offsets) if np.sum(chars[a:b]) > 0]

    # Insert Paddings and separators
    # Row layout: [0, sentiment token, 2, 2, tweet tokens..., 2].
    # NOTE: a tweet with more than MAX_LEN-5 tokens makes the assignment
    # below fail with a shape mismatch; there is no truncation.
    s_tok = sentiment_id[train_df.loc[k, 'sentiment']]
    input_ids[k, :len(enc.ids)+5] = [0] + [s_tok] + [2, 2] + enc.ids + [2]
    attention_mask[k, :len(enc.ids)+5] = 1
    # NOTE(review): the tweet tokens start at position 4 of the row, but the
    # labels use an offset of +1 (and the decoding cells use enc.ids[a-1:b]).
    # The labels therefore look shifted by 3 positions relative to the tokens
    # they describe — confirm. Any change here must be made together with
    # the decoding code.
    if len(toks) > 0:
        start_tokens[k, toks[0]+1] = 1
        end_tokens[k, toks[-1]+1] = 1

#### Prepare test data

In [8]:
# Test-side input arrays: ids filled with 1, masks and type ids with 0.
ct = test_df.shape[0]
test_shape = (ct, MAX_LEN)
input_ids_t = np.ones(test_shape, dtype='int32')
attention_mask_t = np.zeros(test_shape, dtype='int32')
token_type_ids_t = np.zeros(test_shape, dtype='int32')

for k in range(ct):
    # Collapse whitespace and prepend a single space before tokenising.
    cleaned = " " + " ".join(test_df.loc[k, 'text'].split())
    token_ids = tokenizer.encode(cleaned).ids
    sentiment_tok = sentiment_id[test_df.loc[k, 'sentiment']]

    # Row layout: [0, sentiment token, 2, 2, tweet tokens..., 2].
    row = [0, sentiment_tok, 2, 2] + token_ids + [2]
    input_ids_t[k, :len(row)] = row
    attention_mask_t[k, :len(row)] = 1

# 6. Train model

In [9]:
jac = []  # per-fold validation Jaccard scores
VER = 'v3'
DISPLAY = 1  # USE display=1 FOR INTERACTIVE
# Out-of-fold start / end probabilities, filled in fold by fold.
oof_start = np.zeros((input_ids.shape[0], MAX_LEN))
oof_end = np.zeros((input_ids.shape[0], MAX_LEN))

skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=777)
for fold, (idxT, idxV) in enumerate(skf.split(input_ids, train_df.sentiment.values)):

    print('#'*25)
    print('### FOLD %i' % (fold+1))
    print('#'*25)

    # Fresh Keras state and a freshly initialised model for every fold.
    K.clear_session()
    model = build_model()

    reduce_lr = tf.keras.callbacks.LearningRateScheduler(scheduler)

    # Only the weights of the epoch with the best val_loss are kept on disk.
    weights_path = '%s-roberta-%i.h5' % (VER, fold)
    sv = tf.keras.callbacks.ModelCheckpoint(
        weights_path, monitor='val_loss', verbose=1, save_best_only=True,
        save_weights_only=True, mode='auto', save_freq='epoch')

    train_inputs = [input_ids[idxT], attention_mask[idxT], token_type_ids[idxT]]
    train_targets = [start_tokens[idxT], end_tokens[idxT]]
    valid_inputs = [input_ids[idxV], attention_mask[idxV], token_type_ids[idxV]]
    valid_targets = [start_tokens[idxV], end_tokens[idxV]]

    hist = model.fit(train_inputs, train_targets,
                     epochs=EPOCHS, batch_size=BATCH_SIZE, verbose=DISPLAY,
                     callbacks=[sv, reduce_lr],
                     validation_data=(valid_inputs, valid_targets))

    # Restore the best checkpoint before predicting on the held-out fold.
    print('Loading model...')
    model.load_weights(weights_path)

    print('Predicting OOF...')
    oof_start[idxV], oof_end[idxV] = model.predict(
        valid_inputs, verbose=DISPLAY)

    # DISPLAY FOLD JACCARD
    # (named fold_scores rather than `all`, to avoid shadowing the builtin)
    fold_scores = []
    for k in idxV:
        a = np.argmax(oof_start[k])
        b = np.argmax(oof_end[k])
        if a > b:
            # IMPROVE CV/LB with better choice here
            # Inconsistent prediction (start after end): use the whole tweet.
            st = train_df.loc[k, 'text']
        else:
            text1 = " "+" ".join(train_df.loc[k, 'text'].split())
            enc = tokenizer.encode(text1)
            st = tokenizer.decode(enc.ids[a-1:b])
        fold_scores.append(jaccard(st, train_df.loc[k, 'selected_text']))
    fold_jaccard = np.mean(fold_scores)
    jac.append(fold_jaccard)
    print('>>>> FOLD %i Jaccard =' % (fold+1), fold_jaccard)
    print()


# Report the fold count actually used instead of a hard-coded 5.
print('>>>> OVERALL %iFold CV Jaccard =' % n_splits, np.mean(jac))

#########################
### FOLD 1
#########################


2021-11-15 21:22:25.371830: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2021-11-15 21:22:25.373472: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2021-11-15 21:22:25.374552: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2021-11-15 21:22:25.375927: I tensorflow/core/platform/cpu_feature_guard.cc:142] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags

Epoch 1/3


2021-11-15 21:23:09.120647: I tensorflow/stream_executor/cuda/cuda_dnn.cc:369] Loaded cuDNN version 8005



Epoch 00001: val_loss improved from inf to 3.72824, saving model to v3-roberta-0.h5
Epoch 2/3

Epoch 00002: val_loss improved from 3.72824 to 3.65186, saving model to v3-roberta-0.h5
Epoch 3/3

Epoch 00003: val_loss improved from 3.65186 to 3.65146, saving model to v3-roberta-0.h5
Loading model...
Predicting OOF...
>>>> FOLD 1 Jaccard = 0.5337233023417904

#########################
### FOLD 2
#########################


All model checkpoint layers were used when initializing TFRobertaModel.

All the layers of TFRobertaModel were initialized from the model checkpoint at ../input/tf-roberta/pretrained-roberta-base.h5.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFRobertaModel for predictions without further training.


Epoch 1/3

Epoch 00001: val_loss improved from inf to 3.82585, saving model to v3-roberta-1.h5
Epoch 2/3

Epoch 00002: val_loss improved from 3.82585 to 3.67795, saving model to v3-roberta-1.h5
Epoch 3/3

Epoch 00003: val_loss did not improve from 3.67795
Loading model...
Predicting OOF...
>>>> FOLD 2 Jaccard = 0.5276962230252922

#########################
### FOLD 3
#########################


All model checkpoint layers were used when initializing TFRobertaModel.

All the layers of TFRobertaModel were initialized from the model checkpoint at ../input/tf-roberta/pretrained-roberta-base.h5.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFRobertaModel for predictions without further training.


Epoch 1/3

Epoch 00001: val_loss improved from inf to 3.75606, saving model to v3-roberta-2.h5
Epoch 2/3

Epoch 00002: val_loss improved from 3.75606 to 3.69625, saving model to v3-roberta-2.h5
Epoch 3/3

Epoch 00003: val_loss did not improve from 3.69625
Loading model...
Predicting OOF...
>>>> FOLD 3 Jaccard = 0.5208550696760247

#########################
### FOLD 4
#########################


All model checkpoint layers were used when initializing TFRobertaModel.

All the layers of TFRobertaModel were initialized from the model checkpoint at ../input/tf-roberta/pretrained-roberta-base.h5.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFRobertaModel for predictions without further training.


Epoch 1/3

Epoch 00001: val_loss improved from inf to 3.76618, saving model to v3-roberta-3.h5
Epoch 2/3

Epoch 00002: val_loss improved from 3.76618 to 3.73472, saving model to v3-roberta-3.h5
Epoch 3/3

Epoch 00003: val_loss improved from 3.73472 to 3.72169, saving model to v3-roberta-3.h5
Loading model...
Predicting OOF...
>>>> FOLD 4 Jaccard = 0.5226844958409589

#########################
### FOLD 5
#########################


All model checkpoint layers were used when initializing TFRobertaModel.

All the layers of TFRobertaModel were initialized from the model checkpoint at ../input/tf-roberta/pretrained-roberta-base.h5.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFRobertaModel for predictions without further training.


Epoch 1/3

Epoch 00001: val_loss improved from inf to 3.85596, saving model to v3-roberta-4.h5
Epoch 2/3

Epoch 00002: val_loss improved from 3.85596 to 3.70972, saving model to v3-roberta-4.h5
Epoch 3/3

Epoch 00003: val_loss did not improve from 3.70972
Loading model...
Predicting OOF...
>>>> FOLD 5 Jaccard = 0.5233789775598878

>>>> OVERALL 5Fold CV Jaccard = 0.5256676136887908


# 7. Predictions

In [10]:
# Fold-averaged start / end probabilities for the test tweets.
preds_start = np.zeros((input_ids_t.shape[0], MAX_LEN))
preds_end = np.zeros((input_ids_t.shape[0], MAX_LEN))
DISPLAY = 1
# Iterate over exactly n_splits fold models so the loop always matches the
# 1/n_splits averaging weight below (previously a hard-coded 5).
for i in range(n_splits):
    print('#'*25)
    print('### MODEL %i' % (i+1))
    print('#'*25)

    K.clear_session()
    model = build_model()
    # Weights written by the training loop; to reuse weights stored in a
    # dataset instead, load '../input/model-v3/v3-roberta-%i.h5' % i.
    model.load_weights('./v3-roberta-%i.h5' % i)

    print('Predicting Test...')
    preds = model.predict([input_ids_t, attention_mask_t,
                           token_type_ids_t], verbose=DISPLAY)
    preds_start += preds[0]/n_splits
    preds_end += preds[1]/n_splits

#########################
### MODEL 1
#########################


All model checkpoint layers were used when initializing TFRobertaModel.

All the layers of TFRobertaModel were initialized from the model checkpoint at ../input/tf-roberta/pretrained-roberta-base.h5.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFRobertaModel for predictions without further training.


Predicting Test...
#########################
### MODEL 2
#########################


All model checkpoint layers were used when initializing TFRobertaModel.

All the layers of TFRobertaModel were initialized from the model checkpoint at ../input/tf-roberta/pretrained-roberta-base.h5.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFRobertaModel for predictions without further training.


Predicting Test...
#########################
### MODEL 3
#########################


All model checkpoint layers were used when initializing TFRobertaModel.

All the layers of TFRobertaModel were initialized from the model checkpoint at ../input/tf-roberta/pretrained-roberta-base.h5.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFRobertaModel for predictions without further training.


Predicting Test...
#########################
### MODEL 4
#########################


All model checkpoint layers were used when initializing TFRobertaModel.

All the layers of TFRobertaModel were initialized from the model checkpoint at ../input/tf-roberta/pretrained-roberta-base.h5.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFRobertaModel for predictions without further training.


Predicting Test...
#########################
### MODEL 5
#########################


All model checkpoint layers were used when initializing TFRobertaModel.

All the layers of TFRobertaModel were initialized from the model checkpoint at ../input/tf-roberta/pretrained-roberta-base.h5.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFRobertaModel for predictions without further training.


Predicting Test...


#### Get text by index

In [11]:
# Decoded span for every non-neutral test tweet, in row order.
# NOTE: the name `all` shadows the Python builtin; it is kept because the
# submission cell reads this variable.
all = []
for k in range(input_ids_t.shape[0]):
    start_pos = np.argmax(preds_start[k])
    end_pos = np.argmax(preds_end[k])
    if start_pos <= end_pos:
        # Re-tokenise the tweet exactly as during input preparation and
        # decode the predicted token range.
        cleaned = " " + " ".join(test_df.loc[k, 'text'].split())
        encoded = tokenizer.encode(cleaned)
        span_text = tokenizer.decode(encoded.ids[start_pos-1:end_pos])
    else:
        # Start predicted after end: fall back to the whole tweet.
        span_text = test_df.loc[k, 'text']
    all.append(span_text)

# 8. Submission

In [12]:
# Start from an empty prediction column.
test["selected_text"] = ""

neutral_rows = test.sentiment == "neutral"

# Neutral tweets: the prediction is the whole tweet text.
test.loc[neutral_rows, "selected_text"] = test.loc[neutral_rows, "text"]

# Non-neutral tweets: the prediction is the span decoded from the model output.
test.loc[~neutral_rows, "selected_text"] = all

In [13]:
# Sanity check: compare the row counts of the full test set, its non-neutral subset and the submission template.
test.shape, test_df.shape, submission_df.shape

((3534, 4), (2104, 3), (3534, 2))

In [14]:
# Write the id and prediction columns to submission.csv, without the index.
test[['textID', 'selected_text']].to_csv('submission.csv', index=False)

In [15]:
# Preview the first rows, including the new selected_text column.
test.head()

Unnamed: 0,textID,text,sentiment,selected_text
0,f87dea47db,Last session of the day http://twitpic.com/67ezh,neutral,Last session of the day http://twitpic.com/67ezh
1,96d74cb729,Shanghai is also really exciting (precisely -...,positive,exciting
2,eee518ae67,"Recession hit Veronique Branquinho, she has to...",negative,such a shame!
3,01082688c6,happy bday!,positive,happy
4,33987a8ee5,http://twitpic.com/4w75p - I like it!!,positive,i like it!!


# Conclusion

While working on this part, we tried the different types of text preprocessing described in the EDA and BiLSTM notebooks. In the end, we decided that none of the preprocessing gave any positive result. We also tried different types of heads. Simple Dense layers give a good result, and a more complex head lets us improve the result by 10-15%. Obtained result: 0.71141/0.70859

What next:
1. You can use an ensemble of different models
2. You can use different text augmentation methods