# About this kernel
### This kernel explores the possibility of using BERT in this competition with an NER-style (token classification) approach.
### This is just a starter kernel; a lot can be improved from here.

### Inspired by @akensert's kernel [bert-base-tf2-0-now-huggingface-transformer](https://www.kaggle.com/akensert/bert-base-tf2-0-now-huggingface-transformer)
### These kernels were very helpful [tensorflow-roberta-0-705](https://www.kaggle.com/cdeotte/tensorflow-roberta-0-705) and [bert-base-uncased-using-pytorch](https://www.kaggle.com/abhishek/bert-base-uncased-using-pytorch)
### <font color='red'>If you find this kernel helpful, please upvote 😊 (don't just fork it).</font>

# Import the necessary libraries

In [None]:
import os
import gc
import numpy as np 
import pandas as pd
import tensorflow as tf
import tensorflow.keras.backend as K
import tokenizers
from transformers import BertTokenizer,BertConfig,TFBertModel
from tqdm import tqdm

# Register tqdm's progress-bar variants (e.g. `progress_apply`) on pandas objects.
tqdm.pandas()

# Print the full path of every file found under the Kaggle input directory.
for folder, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(folder, name))

In [None]:
# Directory holding the competition CSV files.
DATA_PATH = '/kaggle/input/tweet-sentiment-extraction/'

# Read the training set, the test set and the sample submission, in that order.
train_df, test_df, submission_df = (
    pd.read_csv(DATA_PATH + csv_name)
    for csv_name in ('train.csv', 'test.csv', 'sample_submission.csv')
)

In [None]:
class config:
    """Notebook-wide settings: sequence length, batch sizes, paths and tokenizer."""

    # Length every encoded tweet and every target vector is padded to.
    MAX_LEN = 128

    # Batch sizes and number of training epochs.
    TRAIN_BATCH_SIZE = 64
    VALID_BATCH_SIZE = 32
    TEST_BATCH_SIZE = 32
    EPOCHS = 5

    # Cut-off applied to the model's per-token outputs at prediction time.
    THRESHOLD = 0.4

    # Locations of the pretrained BERT files and of previously saved weights.
    BERT_CONFIG = "/kaggle/input/bertconfig/bert-base-uncased-config.json"
    BERT_PATH = "/kaggle/input/bert-base-uncased-huggingface-transformer/"
    SAVEMODEL_PATH = '/kaggle/input/tftweetfinetuned/finetuned_bert.h5'

    # Lower-casing WordPiece tokenizer built from the vocabulary under BERT_PATH.
    TOKENIZER = tokenizers.BertWordPieceTokenizer(
        f"{BERT_PATH}/bert-base-uncased-vocab.txt",
        lowercase=True,
    )

# Create Targets

In [None]:
def process_data(tweet, selected_text, tokenizer):
    """Build per-token 0/1 labels marking the tokens that cover ``selected_text``.

    The first occurrence of ``selected_text`` inside ``tweet`` is located at
    character level; every token whose character span overlaps that
    occurrence is labelled 1 and every other token 0.

    Args:
        tweet: Text to tokenize.
        selected_text: Substring to locate in ``tweet``. If it is empty or
            does not occur in ``tweet``, every label is 0.
        tokenizer: Object whose ``encode(text)`` returns an encoding exposing
            ``ids`` and ``offsets`` (``(start, end)`` character spans).

    Returns:
        A list of ints (0 or 1), one per token id of the encoded tweet.
    """
    # str.find returns the first match or -1. An empty selected_text is
    # treated as "not found" instead of failing on selected_text[0].
    span_start = tweet.find(selected_text) if selected_text else -1
    span_end = span_start + len(selected_text)

    encoding = tokenizer.encode(tweet)
    targets = [0] * len(encoding.ids)
    if span_start < 0:
        return targets

    for position, (tok_start, tok_end) in enumerate(encoding.offsets):
        # Two half-open intervals overlap iff the larger start lies before the
        # smaller end; tokens with an empty span can therefore never match.
        if max(tok_start, span_start) < min(tok_end, span_end):
            targets[position] = 1
    return targets

In [None]:
def _row_targets(row):
    """Return the token-level targets for one training row."""
    tweet = str(row['text'])
    selection = str(row['selected_text'])
    return process_data(tweet, selection, config.TOKENIZER)


# Compute the targets row by row, with a tqdm progress bar.
train_df['targets'] = train_df.progress_apply(_row_targets, axis=1)

In [None]:
## pad (and, if needed, truncate) every target vector to exactly config.MAX_LEN entries
# `[0] * n` is empty for negative n, so padding alone would leave an over-long
# vector untouched and the stacked targets would be ragged; the slice caps it.
train_df['targets'] = train_df['targets'].apply(
    lambda x: (x + [0] * (config.MAX_LEN - len(x)))[:config.MAX_LEN]
)

# Convert into Bert Format

In [None]:
def _convert_to_transformer_inputs(text, tokenizer, max_sequence_length):
    """Encode ``text`` and fit the result to exactly ``max_sequence_length``.

    Args:
        text: Text to encode.
        tokenizer: Object whose ``encode(text)`` returns an encoding exposing
            ``ids``, ``attention_mask`` and ``type_ids``.
        max_sequence_length: Length of each returned list.

    Returns:
        ``[input_ids, input_masks, input_segments]``, three lists of length
        ``max_sequence_length``. Shorter encodings are right-padded with 0;
        longer ones are cut off at ``max_sequence_length`` (plain truncation,
        so the final token of an over-long encoding is dropped).
    """
    encoding = tokenizer.encode(text)

    def _fit(values, pad_value=0):
        # Truncate first so the padding count below can never be negative;
        # a negative count would silently leave the list longer than requested.
        values = list(values)[:max_sequence_length]
        return values + [pad_value] * (max_sequence_length - len(values))

    return [
        _fit(encoding.ids),
        _fit(encoding.attention_mask),
        _fit(encoding.type_ids),
    ]

def compute_input_arrays(df, tokenizer, max_sequence_length):
    """Encode the ``text`` column of ``df`` into three int32 arrays.

    Args:
        df: DataFrame with a ``text`` column; each value is converted with
            ``str()`` before encoding.
        tokenizer: Tokenizer forwarded to ``_convert_to_transformer_inputs``.
        max_sequence_length: Sequence length forwarded to
            ``_convert_to_transformer_inputs``.

    Returns:
        ``[input_ids, input_masks, input_segments]`` as ``np.int32`` arrays
        with one row per row of ``df``.
    """
    input_ids, input_masks, input_segments = [], [], []
    # Iterate the column itself: iterrows() would build a Series per row just
    # to read one field, and a bare generator hides the row count from tqdm.
    for text in tqdm(df['text']):
        ids, masks, segments = _convert_to_transformer_inputs(
            str(text), tokenizer, max_sequence_length)
        input_ids.append(ids)
        input_masks.append(masks)
        input_segments.append(segments)

    return [np.asarray(input_ids, dtype=np.int32),
            np.asarray(input_masks, dtype=np.int32),
            np.asarray(input_segments, dtype=np.int32)]

def compute_output_arrays(df, columns):
    """Return ``df[columns]`` as a NumPy array built from nested Python lists."""
    selected = df[columns]
    nested_lists = selected.values.tolist()
    return np.asarray(nested_lists)

In [None]:
# Training targets, stacked from the padded `targets` column.
outputs = compute_output_arrays(df=train_df, columns='targets')

# Encoded model inputs for the training and the test tweets.
inputs = compute_input_arrays(
    df=train_df,
    tokenizer=config.TOKENIZER,
    max_sequence_length=config.MAX_LEN,
)
test_inputs = compute_input_arrays(
    df=test_df,
    tokenizer=config.TOKENIZER,
    max_sequence_length=config.MAX_LEN,
)

# Create the model

In [None]:
def create_model():
    """Build the Keras model: pretrained BERT followed by a sigmoid head.

    Returns:
        A ``tf.keras`` model with three int32 inputs of length
        ``config.MAX_LEN`` (passed to BERT as input ids, attention mask and
        token type ids, in that order) and a flattened sigmoid output.
    """
    sequence_shape = (config.MAX_LEN,)
    token_ids = tf.keras.layers.Input(sequence_shape, dtype=tf.int32)
    attention_mask = tf.keras.layers.Input(sequence_shape, dtype=tf.int32)
    token_type_ids = tf.keras.layers.Input(sequence_shape, dtype=tf.int32)

    # Default BertConfig; the weights come from the file under config.BERT_PATH.
    bert_conf = BertConfig()
    weights_file = config.BERT_PATH+'/bert-base-uncased-tf_model.h5'
    bert_model = TFBertModel.from_pretrained(weights_file, config=bert_conf)

    bert_outputs = bert_model(
        token_ids,
        attention_mask=attention_mask,
        token_type_ids=token_type_ids,
    )

    # Head on BERT's first output (presumably the per-token hidden states):
    # dropout, a width-1 convolution with a single filter, flatten, sigmoid.
    head = tf.keras.layers.Dropout(0.1)(bert_outputs[0])
    head = tf.keras.layers.Conv1D(1, 1)(head)
    head = tf.keras.layers.Flatten()(head)
    head = tf.keras.layers.Activation('sigmoid')(head)

    return tf.keras.models.Model(
        inputs=[token_ids, attention_mask, token_type_ids],
        outputs=head,
    )

# Training

In [None]:
# Reset the Keras backend session before building the model.
K.clear_session()

# Build the network and compile it with Adam and binary cross-entropy.
model = create_model()
optimizer = tf.keras.optimizers.Adam(learning_rate=3e-5)
model.compile(
    optimizer=optimizer,
    loss='binary_crossentropy',
)

In [None]:
# Reuse previously saved weights when the file exists; otherwise fine-tune
# here and save the weights to the current working directory.
if os.path.exists(config.SAVEMODEL_PATH):
    model.load_weights(config.SAVEMODEL_PATH)
else:
    model.fit(inputs, outputs, epochs=config.EPOCHS, batch_size=config.TRAIN_BATCH_SIZE)
    model.save_weights('finetuned_bert.h5')

# Prediction

In [None]:
# Model outputs for every test tweet. The batch size is read from config so it
# stays in sync with the other batch-size settings instead of repeating 32.
predictions = model.predict(test_inputs, batch_size=config.TEST_BATCH_SIZE, verbose=1)

In [None]:
## Will change to higher threshold in upcoming versions
threshold = config.THRESHOLD

# 1 where the model output exceeds the threshold, 0 everywhere else.
above_threshold = predictions > threshold
pred = np.where(above_threshold, 1, 0)

In [None]:
def decode_tweet(original_tweet,idx_start,idx_end,offsets):
    """Rebuild the text covered by tokens ``idx_start`` to ``idx_end`` inclusive.

    Args:
        original_tweet: The text the offsets refer to.
        idx_start: Index of the first token to include.
        idx_end: Index of the last token to include.
        offsets: Sequence of ``(start, end)`` character spans, one per token.

    Returns:
        The selected token texts concatenated in order. A single space is
        inserted between two consecutive pieces whenever their spans are not
        adjacent. The result never starts or ends with an inserted space.
    """
    # Tokens with an empty span (presumably special tokens) contribute no text
    # and must not trigger a separator, so drop them up front.
    spans = [(start, end)
             for start, end in offsets[idx_start:idx_end + 1]
             if end > start]

    pieces = []
    previous_end = None
    for start, end in spans:
        # Separate only *between* selected pieces: a gap after the last piece
        # or before the first one would leave stray whitespace in the output.
        if previous_end is not None and previous_end < start:
            pieces.append(" ")
        pieces.append(original_tweet[start:end])
        previous_end = end
    return "".join(pieces)

In [None]:
# Turn each row of thresholded predictions back into a piece of tweet text.
outputs = []
for row_no in range(len(test_df)):
    raw_tweet = test_df.loc[row_no, 'text']
    tweet = str(raw_tweet)
    encoding = config.TOKENIZER.encode(tweet)
    last_token = len(encoding.ids) - 1

    # Positions flagged as selected for this tweet.
    hits = np.where(pred[row_no] == 1)[0]
    if len(hits) > 0:
        first, last = hits[0], hits[-1]
    else:
        # Nothing above the threshold: fall back to the whole encoding.
        first, last = 0, last_token

    # Flagged positions can lie beyond the encoded tokens; clamp the end to
    # the final token of the encoding.
    if last > last_token:
        last = last_token

    if first > last:
        # The selection starts after the encoding ends: keep the full text.
        outputs.append(raw_tweet)
    else:
        outputs.append(decode_tweet(tweet, first, last, encoding.offsets))

In [None]:
# Store the decoded selections as a new `selected_text` column of the test set.
test_df['selected_text'] = outputs

In [None]:
def replacer(row):
    """Choose the final selection for one row.

    Args:
        row: Mapping with ``'sentiment'``, ``'text'`` and ``'selected_text'``.

    Returns:
        ``row['text']`` when the sentiment is ``'neutral'`` or the text has
        fewer than two whitespace-separated words; otherwise
        ``row['selected_text']``.
    """
    text = row['text']
    # str() keeps non-string text (e.g. NaN from a missing value) from raising
    # on .split(), matching the str() conversion used elsewhere in this file.
    if row['sentiment'] == 'neutral' or len(str(text).split()) < 2:
        return text
    return row['selected_text']
# Replace each row's selection with the value replacer() chooses for that row.
test_df['selected_text'] = test_df.apply(replacer,axis=1)

In [None]:
# Display the first 100 rows of the test set, including the final selections.
test_df.head(100)

In [None]:
# Copy the final selections into the submission frame and write it to disk.
submission_df['selected_text'] = test_df['selected_text']
submission_df.to_csv(path_or_buf='submission.csv', index=False)

# Set the column display width, then show 20 randomly sampled submission rows.
pd.set_option('max_colwidth', 80)
submission_df.sample(n=20)