In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import tensorflow as tf
from tensorflow.keras.layers import Dense, Input, Conv1D, Flatten, Dropout, Activation, LeakyReLU
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.models import Model,load_model
from tensorflow.keras.callbacks import ModelCheckpoint
import tensorflow_hub as hub
from tensorflow.keras.utils import to_categorical
from sklearn.model_selection import StratifiedKFold
import tensorflow.keras.backend as K

from shutil import copyfile
#copy our file into the working directory (make sure it has .py suffix)
copyfile(src = "../input/bert-tokenization/bert_tokenization.py", dst = "../working/bert_tokenization.py")

import bert_tokenization
#from transformers import BertTokenizer

Import libraries that we need

# Data Overview

In [None]:
# Read the competition's train and test CSVs, then show how many values are
# missing in each training column.
df_train, df_test = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/kaggle/input/tweet-sentiment-extraction/train.csv',
        '/kaggle/input/tweet-sentiment-extraction/test.csv',
    )
)
df_train.isna().sum()

In [None]:
# Discard every training row that has at least one missing value. The original
# row labels are kept (the index is not reset).
df_train = df_train.dropna(axis=0, how='any')
df_train.head()

In [None]:
# Count missing values per column in the test set.
df_test.isnull().sum()

In [None]:
# Preview the first rows of the test set.
df_test.head()

In [None]:
# Two stacked bar charts: sentiment class counts for train (top) and test (bottom).
colors = sns.color_palette()
fig, (ax_train, ax_test) = plt.subplots(nrows=2, ncols=1, figsize=(10, 10))

sentiment_num_1 = df_train['sentiment'].value_counts()
sentiment_num_1.plot(kind='bar', color=colors[0], rot=0, ax=ax_train)
ax_train.set_title('Sentiment Distribution for Train Data')

sentiment_num_2 = df_test['sentiment'].value_counts()
sentiment_num_2.plot(kind='bar', color=colors[1], rot=0, ax=ax_test)
ax_test.set_title('Sentiment Distribution for Test Data')

fig.tight_layout(pad=3)
plt.show()

A simple visualization of the class distribution: the sentiment distributions of the train and test data are almost the same.

In [None]:
# Whitespace-delimited word counts for the full tweet and for its labelled selection.
for source_col, count_col in (('text', 'word_cnt_full_texts'),
                              ('selected_text', 'word_cnt_sel_texts')):
    df_train[count_col] = df_train[source_col].apply(lambda x: len(x.split()))

# One panel per sentiment, overlaying the two word-count distributions.
fig, axes = plt.subplots(nrows=3, ncols=1, figsize=(8, 20))
for ax, s in zip(axes, ['positive', 'negative', 'neutral']):
    subset = df_train[df_train.sentiment == s]
    for count_col, colour, legend_label in (('word_cnt_full_texts', 'skyblue', 'full texts'),
                                            ('word_cnt_sel_texts', 'red', 'sel texts')):
        sns.distplot(subset[count_col], bins=20, color=colour, label=legend_label, ax=ax)
    ax.legend(fontsize=14)
    ax.set_title('%s: full texts length vs selected texts length'%(s),fontsize=15,fontweight='bold')

For neutral tweets, the selected text and the full text are almost the same length, whereas for positive and negative tweets the selected text is only a small part of the full text.

# Bert

First I take a look at how the tokenizer works

In [None]:
# Load the BERT module from the local dataset directory as a trainable Keras layer.
bert_layer = hub.KerasLayer('../input/berthub', trainable=True)
# Read the vocabulary file path and the lower-casing flag stored with the module,
# so the tokenizer is configured from the same assets as the layer.
vocab_file = bert_layer.resolved_object.vocab_file.asset_path.numpy()
do_lower_case = bert_layer.resolved_object.do_lower_case.numpy()
tokenizer = bert_tokenization.FullTokenizer(vocab_file, do_lower_case)

In [None]:
# Show one raw tweet (row label 1), its tokens, and the matching vocabulary ids.
example_text = df_train['text'][1]
example_text_tokens = tokenizer.tokenize(example_text)
print(example_text)
print(example_text_tokens)
print(tokenizer.convert_tokens_to_ids(example_text_tokens))

In [None]:
# Same demonstration for the selected text of the same row (row label 1).
example_selection = df_train['selected_text'][1]
example_selection_tokens = tokenizer.tokenize(example_selection)
print(example_selection)
print(example_selection_tokens)
print(tokenizer.convert_tokens_to_ids(example_selection_tokens))

In [None]:
# Print the vocabulary ids of the two special tokens used to frame each input.
for special_token in ('[CLS]', '[SEP]'):
    print(tokenizer.convert_tokens_to_ids([special_token]))

## Encode train and test data

Next, I walk through the encoding step by step.

Step 1. Decompose full texts into three parts: texts before selected texts, selected texts and texts after selected texts

I use the second sentence as an example

In [None]:
# Pull the three columns used for encoding out of the training frame and turn
# them into plain Python lists, so positions run 0..n-1 regardless of row labels.
train_texts, train_sel_texts, train_sentiment = (
    df_train[column] for column in ('text', 'selected_text', 'sentiment')
)
texts = train_texts.tolist()
sentiments = train_sentiment.tolist()
sel_texts = train_sel_texts.tolist()
len(sel_texts),len(texts),len(sentiments)

In [None]:
# Inspect the example at list position 1: sentiment, full tweet, selected text.
sentiments[1],texts[1],sel_texts[1]

In [None]:
# Locate the selected text inside the full tweet by character offset.
# NOTE(review): str.find returns -1 when the selection is not a substring; that
# case is not handled in this walkthrough.
start_idx = texts[1].find(sel_texts[1])
end_idx = start_idx + len(sel_texts[1])-1                # inclusive offset of the selection's last character
sentiment = sentiments[1]
# Tokenize the three character ranges separately: before, inside and after the selection.
full_text_1 = tokenizer.tokenize(texts[1][:start_idx])
full_text_2 = tokenizer.tokenize(texts[1][start_idx:end_idx+1])
full_text_3 = tokenizer.tokenize(texts[1][end_idx+1:])
sentiment,full_text_1,full_text_2,full_text_3

Step 2. Build the three input arrays: input_ids, attention_masks, type_ids

In [None]:
# Padded sequence length used by the walkthrough cells that follow.
max_len=150

In [None]:
# Input layout: [CLS] <sentiment> [SEP] <tweet tokens> [SEP], right-padded with id 0 up to max_len.
input_tokens = ['[CLS]'] + [sentiment] + ['[SEP]'] + full_text_1+ full_text_2 + full_text_3 +['[SEP]']
pad_len = max_len - len(input_tokens)
valid_input_ids = tokenizer.convert_tokens_to_ids(input_tokens)
input_ids = valid_input_ids + [0]*pad_len
# Attention mask: 1 over the real tokens, 0 over the padding.
attention_masks = [1]*len(valid_input_ids) + [0]*pad_len
# Type ids: 0 for the three-token sentiment prefix, 1 for the tweet tokens and
# the closing [SEP], 0 for the padding.
type_ids = [0]*3 + [1]*(len(valid_input_ids)-3)+[0]*pad_len
print(input_tokens)
print(input_ids)
print(attention_masks)
print(type_ids)

Step 3. Build the two output arrays: start and end

In [None]:
# One-hot label vectors of length max_len.
# The start label sits at position 3 + len(full_text_1): the first selected token,
# after the three-token prefix and the tokens preceding the selection.
start_tokens = [0]*(len(full_text_1)+3)+[1]+[0]*(max_len-len(full_text_1)-4)
# The end label sits at position 3 + len(full_text_1) + len(full_text_2) - 1: the last selected token.
end_tokens = [0]*(len(full_text_1)+len(full_text_2)+2)+[1]+[0]*(max_len-len(full_text_1)-len(full_text_2)-3)
print(start_tokens)
print(end_tokens)

Check if all the arrays have the same length

In [None]:
# Lengths of the three input arrays and the two label arrays; they should all match.
len(input_ids),len(attention_masks),len(type_ids),len(start_tokens),len(end_tokens)

## Wrap the step-by-step encoding methods

In [None]:
def bert_encode_train(sentiments, texts, sel_texts, tokenizer, max_len =512):
    """Encode (sentiment, tweet, selected text) triples into model inputs and span labels.

    Every example is laid out as ``[CLS] <sentiment> [SEP] <tweet tokens> [SEP]``
    and right-padded with id 0 up to ``max_len``. The tweet is split at the
    character offsets of the selected text and the three pieces (before, inside,
    after) are tokenized separately, so the label positions are token offsets.

    Args:
        sentiments: iterable of sentiment strings, one per example; each is
            inserted as a single token.
        texts: iterable of full tweet strings.
        sel_texts: iterable of selected-text strings.
        tokenizer: object exposing ``tokenize(str) -> list`` and
            ``convert_tokens_to_ids(list) -> list``.
        max_len: fixed length of every encoded row (must be at least 4).

    Returns:
        A 5-tuple of arrays, each with one row of length ``max_len`` per example:
        input ids, attention mask, type ids, one-hot start label, one-hot end label.

    Raises:
        ValueError: if ``max_len`` cannot hold the three-token prefix plus the
            closing ``[SEP]``.
    """
    prefix_len = 3  # [CLS], sentiment, [SEP]
    if max_len < prefix_len + 1:
        raise ValueError('max_len must be at least %d, got %d' % (prefix_len + 1, max_len))
    # Room left for tweet tokens once the prefix and the closing [SEP] are counted.
    max_text_tokens = max_len - prefix_len - 1

    all_input_ids = []
    all_masks = []
    all_type_ids = []
    all_start_tokens = []
    all_end_tokens = []

    for sentiment, text, sel_text in zip(sentiments, texts, sel_texts):
        start_idx = text.find(sel_text)
        if start_idx < 0:
            # str.find returns -1 when the selection is not a substring. Using -1
            # as an offset would silently mis-split the tweet, so fall back to
            # treating the whole tweet as the selection.
            start_idx, stop_idx = 0, len(text)
        else:
            stop_idx = start_idx + len(sel_text)  # exclusive end offset

        before = tokenizer.tokenize(text[:start_idx])
        selected = tokenizer.tokenize(text[start_idx:stop_idx])
        after = tokenizer.tokenize(text[stop_idx:])

        # Truncate over-long tweets so every row has exactly max_len entries.
        text_tokens = (before + selected + after)[:max_text_tokens]
        input_tokens = ['[CLS]', sentiment, '[SEP]'] + text_tokens + ['[SEP]']
        pad_len = max_len - len(input_tokens)
        valid_input_ids = tokenizer.convert_tokens_to_ids(input_tokens)
        input_ids = valid_input_ids + [0]*pad_len
        attention_masks = [1]*len(valid_input_ids) + [0]*pad_len
        type_ids = [0]*prefix_len + [1]*(len(valid_input_ids)-prefix_len) + [0]*pad_len

        # Label positions, clamped so they stay inside the encoded row and the
        # end never precedes the start (possible with an empty selection).
        last_text_pos = max(prefix_len, len(input_tokens) - 2)
        start_pos = min(prefix_len + len(before), last_text_pos)
        end_pos = min(max(start_pos, prefix_len + len(before) + len(selected) - 1),
                      last_text_pos)

        start_tokens = [0]*max_len
        start_tokens[start_pos] = 1
        end_tokens = [0]*max_len
        end_tokens[end_pos] = 1

        all_input_ids.append(input_ids)
        all_masks.append(attention_masks)
        all_type_ids.append(type_ids)
        all_start_tokens.append(start_tokens)
        all_end_tokens.append(end_tokens)

    return np.array(all_input_ids), np.array(all_masks), np.array(all_type_ids),np.array(all_start_tokens),np.array(all_end_tokens)

Check max length after bert encoding

In [None]:
# Longest encoded training sequence, using the same layout as the encoders:
# [CLS] <sentiment> [SEP] <tweet tokens> [SEP].
# The sentiment is one token; list(sentiment) would split it into characters and
# inflate the length. Iterating over the column values (instead of integer labels
# inside a bare try/except) visits every row and does not hide real errors.
# NOTE: this reassigns the notebook-level `max_len`.
max_len = 0
for text, sentiment in df_train[['text', 'sentiment']].dropna().itertuples(index=False, name=None):
    tokens = tokenizer.tokenize(text)
    input_ids = tokenizer.convert_tokens_to_ids(['[CLS]', sentiment, '[SEP]'] + tokens + ['[SEP]'])
    max_len = max(max_len, len(input_ids))

print('Max length for training data: ', max_len)

In [None]:
# Longest encoded test sequence, using the same layout as the encoders:
# [CLS] <sentiment> [SEP] <tweet tokens> [SEP].
# Iterates over the test frame's own rows (the loop bound previously came from
# df_train), keeps the sentiment as a single token, and no longer swallows
# exceptions.
# NOTE: this reassigns the notebook-level `max_len`.
max_len = 0
for text, sentiment in df_test[['text', 'sentiment']].dropna().itertuples(index=False, name=None):
    tokens = tokenizer.tokenize(text)
    input_ids = tokenizer.convert_tokens_to_ids(['[CLS]', sentiment, '[SEP]'] + tokens + ['[SEP]'])
    max_len = max(max_len, len(input_ids))

print('Max length for test data: ', max_len)

Since the word count of full texts and selected texts for neutral text is almost the same, I only use positive and negative texts as training and test data. 

In [None]:
# Keep only positive/negative tweets; neutral ones are handled by a rule at
# prediction time instead of by the model.
non_neutral_train = df_train['sentiment'] != 'neutral'
train_texts = df_train.loc[non_neutral_train, 'text']
train_sel_texts = df_train.loc[non_neutral_train, 'selected_text']
train_sentiment = df_train.loc[non_neutral_train, 'sentiment']
full_texts = list(train_texts)
sentiments = list(train_sentiment)
sel_texts = list(train_sel_texts)

# Encode once and split the result; calling the encoder twice would tokenize the
# whole corpus a second time for no benefit.
encoded_train = bert_encode_train(sentiments,full_texts,sel_texts,tokenizer, max_len =150)
train_input = encoded_train[:3]    # input ids, attention masks, type ids
train_labels = encoded_train[3:]   # one-hot start and end positions

In [None]:
def bert_encode_test(sentiments, texts, tokenizer, max_len =512):
    """Encode (sentiment, tweet) pairs into model inputs (no labels).

    Every example is laid out as ``[CLS] <sentiment> [SEP] <tweet tokens> [SEP]``
    and right-padded with id 0 up to ``max_len``. Tweets with too many tokens are
    truncated so that every row has exactly ``max_len`` entries.

    Args:
        sentiments: iterable of sentiment strings, one per example; each is
            inserted as a single token.
        texts: iterable of full tweet strings.
        tokenizer: object exposing ``tokenize(str) -> list`` and
            ``convert_tokens_to_ids(list) -> list``.
        max_len: fixed length of every encoded row (must be at least 4).

    Returns:
        A 3-tuple of arrays, each with one row of length ``max_len`` per example:
        input ids, attention mask, type ids.

    Raises:
        ValueError: if ``max_len`` cannot hold the three-token prefix plus the
            closing ``[SEP]``.
    """
    prefix_len = 3  # [CLS], sentiment, [SEP]
    if max_len < prefix_len + 1:
        raise ValueError('max_len must be at least %d, got %d' % (prefix_len + 1, max_len))
    # Room left for tweet tokens once the prefix and the closing [SEP] are counted.
    max_text_tokens = max_len - prefix_len - 1

    all_input_ids = []
    all_masks = []
    all_type_ids = []

    for sentiment, raw_text in zip(sentiments, texts):
        text = tokenizer.tokenize(raw_text)[:max_text_tokens]
        input_tokens = ['[CLS]', sentiment, '[SEP]'] + text + ['[SEP]']
        pad_len = max_len - len(input_tokens)
        valid_input_ids = tokenizer.convert_tokens_to_ids(input_tokens)
        input_ids = valid_input_ids + [0]*pad_len
        attention_masks = [1]*len(valid_input_ids) + [0]*pad_len
        # 0 for the sentiment prefix, 1 for the tweet and closing [SEP], 0 for padding.
        type_ids = [0]*prefix_len + [1]*(len(valid_input_ids)-prefix_len) + [0]*pad_len

        all_input_ids.append(input_ids)
        all_masks.append(attention_masks)
        all_type_ids.append(type_ids)

    return np.array(all_input_ids), np.array(all_masks), np.array(all_type_ids)

In [None]:
# Encode only the non-neutral test tweets; neutral ones never go through the model.
non_neutral_test = df_test['sentiment'] != 'neutral'
test_texts = df_test.loc[non_neutral_test, 'text']
test_sentiment = df_test.loc[non_neutral_test, 'sentiment']
full_texts_test = test_texts.tolist()
sentiments_test = test_sentiment.tolist()

test_input = bert_encode_test(sentiments_test, full_texts_test, tokenizer, max_len =150)

## Build Bert Model with CNN head

In [None]:
# K.clear_session()

In [None]:
def build_bert(bert_layer, max_len =512):
    """Build and compile a span-extraction model on top of ``bert_layer``.

    The model takes three int32 inputs of length ``max_len`` (word ids, attention
    mask, type ids) and produces two softmax distributions over the ``max_len``
    positions: one for the span start and one for the span end.

    Args:
        bert_layer: callable Keras layer applied to
            ``[input_word_ids, input_mask, input_type_ids]``.
        max_len: fixed input sequence length.

    Returns:
        The compiled Keras ``Model`` (Adam, learning rate 3e-5, categorical
        cross-entropy applied to both outputs).
    """
    input_word_ids = Input(shape=(max_len,), dtype=tf.int32)
    input_mask = Input(shape=(max_len,), dtype=tf.int32)
    input_type_ids = Input(shape=(max_len,), dtype=tf.int32)

    clf_output = bert_layer([input_word_ids, input_mask, input_type_ids])
    # NOTE(review): element 1 is presumably the per-token sequence output
    # (pooled_output, sequence_output) -- confirm for the hub module in use.
    sequence_output = clf_output[1]

    def _position_head(features):
        """Per-position score (1x1 convolution) flattened and softmax-normalised."""
        head = Dropout(0.1)(features)
        head = Conv1D(filters=1, kernel_size=1)(head)
        head = Flatten()(head)
        return Activation('softmax')(head)

    out1 = _position_head(sequence_output)  # span start
    out2 = _position_head(sequence_output)  # span end

    model = Model(inputs=[input_word_ids, input_mask, input_type_ids], outputs=[out1, out2])
    # `learning_rate` replaces the deprecated `lr` keyword.
    model.compile(optimizer=Adam(learning_rate=3e-5), loss='categorical_crossentropy')
    # summary() prints by itself and returns None, so it is not wrapped in print().
    model.summary()
    return model


# Build the model for 150-token inputs and fine-tune it for three epochs on an
# 80/20 train/validation split, saving the model whenever val_loss improves.
model = build_bert(bert_layer, max_len=150)
filepath = 'best_weight.hdf5'
checkpoint = ModelCheckpoint(
    filepath,
    monitor='val_loss',
    verbose=1,
    save_best_only=True,
    mode='min',
)
model.fit(
    train_input,
    train_labels,
    epochs=3,
    batch_size=16,
    callbacks=[checkpoint],
    validation_split=0.2,
)

In [None]:
def jaccard_similarity(str1,str2):
    """Word-level Jaccard similarity between two strings.

    Both strings are lower-cased and split on whitespace; the score is the size
    of the shared word set divided by the size of the union. When neither string
    contains any word the function returns 0.5.
    """
    words1 = set(str1.lower().split())
    words2 = set(str2.lower().split())
    if len(words1) == 0 and len(words2) == 0:
        return 0.5
    shared = len(words1 & words2)
    union_size = len(words1) + len(words2) - shared
    return float(shared / union_size)

In [None]:
# Sanity check on one example: a full tweet against a two-word selection.
jaccard_similarity(' Sooo SAD I will miss you here in San Diego!!!','Sooo SAD')

# Predict and Submission

For the submission, neutral tweets are assigned their full text as the selected text.

In [None]:
# Reload the checkpoint with the lowest validation loss and predict with IT; the
# in-memory `model` holds the last-epoch weights, which are not necessarily the best.
# custom_objects maps the serialized layer name to the layer class.
best_model = load_model('./best_weight.hdf5',custom_objects={'KerasLayer':hub.KerasLayer})
pred_start,pred_end = best_model.predict(test_input)

# Decode each predicted (start, end) token span back to text.
results = []
for ids_row, full_text, start_probs, end_probs in zip(test_input[0], full_texts_test, pred_start, pred_end):
    a = np.argmax(start_probs)
    b = np.argmax(end_probs)

    if a>b:
        # Inconsistent span (start after end): fall back to the whole tweet.
        sel_text = full_text
    else:
        # NOTE(review): the text is rebuilt from tokens, so original casing and
        # spacing are presumably not preserved -- confirm against the tokenizer.
        sel_text = ' '.join(tokenizer.convert_ids_to_tokens(ids_row[a:b+1]))

    results.append(sel_text)

# The tokenizer marks word continuations with a '##' prefix; glue those pieces
# back onto the preceding token.
results = [x.replace(' ##','') for x in results]

# Neutral tweets keep their full text; non-neutral tweets get the model's span.
non_neutral_rows = df_test['sentiment']!='neutral'
df_test['selected_text'] = df_test['text']
df_test.loc[non_neutral_rows,'selected_text'] = results

output = pd.read_csv('/kaggle/input/tweet-sentiment-extraction/sample_submission.csv')
output['selected_text'] = df_test['selected_text']

In [None]:
# Write the submission file with a header row and without the index column.
output.to_csv('submission.csv',index=False,header=True)

In [None]:
# Add the original text and sentiment to the in-memory frame so predictions can
# be read next to their inputs, then show the first ten rows.
output['text'] = df_test['text']
output['sentiment'] = df_test['sentiment']
output.head(10)