# TensorFlow BERT

# Load Libraries, Data, Tokenizer

In [None]:
import os
import numpy as np
import pandas as pd
import tensorflow as tf
import tensorflow.keras.backend as K
import tokenizers
from transformers import BertTokenizer, BertConfig, TFBertModel
from tqdm import tqdm
tqdm.pandas()

In [None]:
# Fixed sequence length: token id / mask / target lists are right-padded to this size.
MAX_LEN = 128
# Paths to the pretrained bert-base-uncased artifacts under /kaggle/input.
# NOTE(review): BERT_CONFIG is not referenced anywhere else in the visible code;
# build_model() instantiates a default BertConfig() instead. Confirm this is intended.
BERT_CONFIG = '/kaggle/input/bertconfig/bert-base-uncased-config.json'
BERT_PATH = '/kaggle/input/bert-base-uncased-huggingface-transformer//bert-base-uncased-tf_model.h5'
# WordPiece tokenizer built from the vocab file; input text is lowercased.
TOKENIZER = tokenizers.BertWordPieceTokenizer("/kaggle/input/bert-base-uncased-huggingface-transformer//bert-base-uncased-vocab.txt", lowercase=True)

# Competition data. fillna('') turns missing cells into empty strings, so
# downstream code can receive '' for text and selected_text.
train = pd.read_csv('../input/tweet-sentiment-extraction/train.csv').fillna('')
test = pd.read_csv('../input/tweet-sentiment-extraction/test.csv').fillna('')
# Output template; its selected_text column is overwritten with predictions later.
submission = pd.read_csv('../input/tweet-sentiment-extraction/sample_submission.csv')

# Training Data

In [None]:
def process_data(tweet, selected_text, tokenizer):
    """Build a per-token 0/1 mask marking the tokens that cover ``selected_text``.

    The first exact occurrence of ``selected_text`` inside ``tweet`` is located,
    and every token whose character span overlaps that occurrence is flagged.

    Args:
        tweet: Full tweet text.
        selected_text: Substring of ``tweet`` to label. May be empty, in which
            case nothing is flagged.
        tokenizer: Object whose ``encode(text)`` returns an encoding exposing
            ``ids`` and ``offsets`` (a sequence of ``(start, end)`` character
            spans, one per token).

    Returns:
        A list of ints with one entry per token id: 1 for tokens overlapping the
        selection, 0 otherwise. All zeros when ``selected_text`` is empty or
        does not occur in ``tweet``.
    """
    encoding = tokenizer.encode(tweet)
    targets = [0] * len(encoding.ids)

    # An empty selection has no span to label. It must be checked explicitly
    # because str.find('') would report a match at position 0.
    span_start = tweet.find(selected_text) if selected_text else -1
    if span_start < 0:
        return targets
    span_end = span_start + len(selected_text)

    for token_idx, (tok_start, tok_end) in enumerate(encoding.offsets):
        # Half-open intervals overlap iff the later start precedes the earlier
        # end; zero-width token spans therefore never match.
        if max(tok_start, span_start) < min(tok_end, span_end):
            targets[token_idx] = 1
    return targets

def convert_to_transformer_inputs(text, tokenizer, max_length):
    """Encode ``text`` into three fixed-length feature lists for the model.

    Args:
        text: Text to tokenize.
        tokenizer: Object whose ``encode(text)`` returns an encoding exposing
            ``ids``, ``attention_mask`` and ``type_ids``.
        max_length: Exact length of each returned list.

    Returns:
        ``[input_ids, input_masks, input_segments]``, each a list of exactly
        ``max_length`` ints. Shorter encodings are right-padded with 0
        (presumably the [PAD] id of the vocabulary; confirm against the vocab
        file). Longer encodings are cut to the first ``max_length`` entries,
        which drops any trailing tokens, so that stacking rows into a
        rectangular array cannot fail.
    """
    encoding = tokenizer.encode(text)
    features = []
    for values in (encoding.ids, encoding.attention_mask, encoding.type_ids):
        # Truncate first so the pad count below is never negative.
        values = list(values)[:max_length]
        values.extend([0] * (max_length - len(values)))
        features.append(values)
    return features

def _pad_targets(token_targets):
    """Right-pad a token mask with zeros up to MAX_LEN entries."""
    return token_targets + [0] * (MAX_LEN - len(token_targets))


# One 0/1 token mask per training row, then padded to the fixed model length.
train['targets'] = train.progress_apply(
    lambda row: process_data(str(row['text']), str(row['selected_text']), TOKENIZER),
    axis=1,
)
train['targets'] = train['targets'].apply(_pad_targets)

# Encode every training tweet, then split the per-row triples into three columns.
encoded_rows = [
    convert_to_transformer_inputs(str(row.text), TOKENIZER, MAX_LEN)
    for _, row in tqdm(train.iterrows())
]
input_ids = [features[0] for features in encoded_rows]
input_masks = [features[1] for features in encoded_rows]
input_segments = [features[2] for features in encoded_rows]

inputs = [
    np.asarray(column, dtype=np.int32)
    for column in (input_ids, input_masks, input_segments)
]
outputs = np.asarray(train['targets'].tolist())

# Test Data

In [None]:
# Encode every test tweet with the same fixed-length scheme used for training.
encoded_rows = [
    convert_to_transformer_inputs(str(row.text), TOKENIZER, MAX_LEN)
    for _, row in tqdm(test.iterrows())
]
input_ids = [features[0] for features in encoded_rows]
input_masks = [features[1] for features in encoded_rows]
input_segments = [features[2] for features in encoded_rows]

test_inputs = [
    np.asarray(column, dtype=np.int32)
    for column in (input_ids, input_masks, input_segments)
]

# Build BERT Model

In [None]:
def build_model():
    """Assemble the Keras model: pretrained BERT plus a per-token scoring head.

    Three int32 inputs of length MAX_LEN (token ids, attention mask, token type
    ids) feed the BERT encoder loaded from BERT_PATH with a default BertConfig.
    The first element of the encoder output goes through dropout and a
    kernel-size-1 convolution with a single filter, is flattened, and is
    normalized with a softmax.

    Returns:
        An uncompiled ``tf.keras.models.Model`` taking the three inputs in the
        order ``[ids, mask, token type ids]``.
    """
    keras_layers = tf.keras.layers

    token_ids = keras_layers.Input((MAX_LEN,), dtype=tf.int32)
    attention_mask = keras_layers.Input((MAX_LEN,), dtype=tf.int32)
    segment_ids = keras_layers.Input((MAX_LEN,), dtype=tf.int32)

    encoder = TFBertModel.from_pretrained(BERT_PATH, config=BertConfig())
    encoder_outputs = encoder(
        token_ids,
        attention_mask=attention_mask,
        token_type_ids=segment_ids,
    )

    # Scoring head, applied in order to the first encoder output.
    head = (
        keras_layers.Dropout(0.1),
        keras_layers.Conv1D(1, 1),
        keras_layers.Flatten(),
        keras_layers.Activation('softmax'),
    )
    scores = encoder_outputs[0]
    for layer in head:
        scores = layer(scores)

    return tf.keras.models.Model(
        inputs=[token_ids, attention_mask, segment_ids],
        outputs=scores,
    )

# Train BERT Model

In [None]:
# Reset Keras global state before building a fresh model.
K.clear_session()
model = build_model()
optimizer = tf.keras.optimizers.Adam(learning_rate=3e-5)
# Targets are 0/1 token masks (a row may contain several ones), scored against
# the model's softmax over the MAX_LEN positions.
model.compile(loss='categorical_crossentropy', optimizer=optimizer)
# Fine-tune on the full training arrays; no validation data is passed.
model.fit(inputs,outputs, epochs=16, batch_size=32)
model.save_weights(f'finetuned_bert.h5')
predictions = model.predict(test_inputs, batch_size=32, verbose=1)
# Binarize: 1 where the predicted value exceeds 0.4, else 0.
# NOTE(review): the model ends in a softmax across positions, so each row sums
# to 1 and at most two positions per tweet can exceed 0.4. Confirm that this
# threshold is intended.
pred = np.where(predictions>0.4, 1,0)

In [None]:
def decode(tweet,idx_start,idx_end,offsets):
    """Rebuild the text covered by tokens ``idx_start``..``idx_end`` (inclusive).

    Each token contributes ``tweet[start:end]`` for its offset pair. A single
    space is appended after a token whenever the next token in ``offsets``
    starts after this token ends. This also applies to the last selected token,
    so the result can end with a space.

    Args:
        tweet: Original text the offsets refer to.
        idx_start: Index of the first token to include.
        idx_end: Index of the last token to include.
        offsets: Sequence of ``(start, end)`` character spans, one per token.

    Returns:
        The reconstructed string. Indices outside ``0..len(offsets) - 1`` are
        clamped to that range instead of raising ``IndexError``; an empty range
        yields ``""``.
    """
    # Callers may pass positions from a fixed-length prediction vector that is
    # longer than the actual token list, so clamp to the tokens that exist.
    first = max(idx_start, 0)
    last = min(idx_end, len(offsets) - 1)

    pieces = []
    for ix in range(first, last + 1):
        start, end = offsets[ix][0], offsets[ix][1]
        pieces.append(tweet[start:end])
        # A gap before the next token means whitespace separated them.
        if ix + 1 < len(offsets) and end < offsets[ix + 1][0]:
            pieces.append(" ")
    return "".join(pieces)

# Turn each row of binarized token predictions into a text span: the span runs
# from the first flagged token to the last flagged token.
final = []
for token_flags, raw_text in zip(pred, test['text']):
    text = str(raw_text)
    encoded_text = TOKENIZER.encode(text)
    n_tokens = len(encoded_text.ids)

    # Prediction rows have MAX_LEN positions but the tweet has only n_tokens
    # tokens; flagged padding positions have no offsets and are ignored.
    indexes = [int(i) for i in np.where(token_flags == 1)[0] if i < n_tokens]
    if indexes:
        start_tokens, end_tokens = indexes[0], indexes[-1]
    else:
        # Nothing flagged: fall back to the whole tweet.
        start_tokens, end_tokens = 0, n_tokens - 1

    if start_tokens > end_tokens:
        # Only possible when the encoding has no tokens at all.
        selected_text = raw_text
    else:
        selected_text = decode(text, start_tokens, end_tokens, encoded_text.offsets)
    final.append(selected_text)

test['selected_text'] = final
submission['selected_text'] = test['selected_text']
# index=False keeps the file to the columns of the sample submission; the
# default would prepend the DataFrame index as an extra unnamed column.
submission.to_csv('submission.csv', index=False)