## Libraries

In [None]:
import numpy as np 
import pandas as pd 

from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import seaborn as sns
from keras.utils import plot_model
from keras.callbacks import ModelCheckpoint,EarlyStopping


from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from tokenizers import BertWordPieceTokenizer

from keras.layers import BatchNormalization
import tensorflow as tf
import keras
from keras.constraints import unit_norm
from keras import regularizers
from keras import backend as K
from keras.layers import Input, Embedding,Flatten,concatenate, Conv1D, Bidirectional,Dropout
from keras.models import load_model
from numpy.testing import assert_allclose

In [None]:
# Read the competition training set into a DataFrame.
# Columns used further down: textID, text, selected_text, sentiment.
data = pd.read_csv('/kaggle/input/tweet-sentiment-extraction/train.csv')

In [None]:
# Remove a single row, identified by its textID, that is treated as noisy,
# then display the remaining frame.
data = data[data.textID != '12f21c8f19']
data

In [None]:
# Remove rows whose tweet text is missing or empty and renumber the index.
# The result is assigned back instead of using inplace=True on a column
# selection: chained in-place mutation triggers warnings in recent pandas and
# may leave `data` itself unchanged.
data = (
    data.assign(text=data['text'].replace('', np.nan))
        .dropna(subset=['text'])
        .reset_index(drop=True)
)

In [None]:
# Collapse every run of whitespace to one space (and trim both ends) in the
# tweet and in its labelled span.
for column in ('text', 'selected_text'):
    data[column] = data[column].apply(lambda raw: " ".join(raw.split()))


In [None]:
# Cast the three working columns to str, then display the frame.
data = data.astype({"text": str, "selected_text": str, 'sentiment': str})
data

## Splitting data into train, CV and test

In [None]:

# Hold out 5% of the rows as a test split, then take 10% of what is left as
# the validation (cv) split. Both calls use the same fixed random_state.
x_train, x_test = train_test_split(data, test_size=0.05, random_state=42)
x_train, x_cv = train_test_split(x_train, test_size=0.1, random_state=42)

for split_name, split in (("x_train", x_train), ("x_cv", x_cv), ("x_test", x_test)):
    print(split_name + " shape is", split.shape)


In [None]:
# Renumber each split 0..n-1 so the positional `.loc[i, ...]` lookups used
# throughout the rest of the notebook address the i-th row.
x_train = x_train.reset_index(drop=True)
x_cv = x_cv.reset_index(drop=True)
x_test = x_test.reset_index(drop=True)

In [None]:
# Sentiment balance per split: one count plot per split, every bar annotated
# with its share of that split.
# https://stackoverflow.com/questions/31749448/how-to-add-percentages-on-top-of-bars-in-seaborn
sentiment_order = ['neutral', 'positive', 'negative']
fig, ax = plt.subplots(figsize=(15, 5), nrows=1, ncols=3)
ax = ax.flatten()
for axis, (split_name, split) in zip(ax, [('train', x_train), ('cv', x_cv), ('test', x_test)]):
    # The column is passed by keyword: recent seaborn releases interpret the
    # first positional argument as `data`, not as the x vector.
    sns.countplot(x=split.sentiment, ax=axis, order=sentiment_order)
    axis.set_title(split_name)
    total = split.shape[0]
    for p in axis.patches:
        height = p.get_height()
        # Percentage label centred just above the bar.
        axis.text(p.get_x() + p.get_width() / 2.,
                  height + 4,
                  '{:1.2f}%'.format(height * 100 / total),
                  ha="center")

In [None]:
# Display the training split.
x_train

In [None]:
# Print the shape of the train, cv and test splits, in that order.
for split in (x_train, x_cv, x_test):
    print(split.shape)

In [None]:
# Create NLTK's WordPunctTokenizer with its default settings.
# It is used below to split tweets and selected spans into tokens.
from nltk.tokenize import WordPunctTokenizer 

tokenizer = WordPunctTokenizer()

In [None]:
# Split every tweet into tokens, producing one token list per row and split.
# No exception handling is needed: tokenizing happens before the list is
# extended, and appending to a list cannot fail.
train_text = [tokenizer.tokenize(x_train.loc[i, 'text']) for i in range(x_train.shape[0])]

cv_text = [tokenizer.tokenize(x_cv.loc[i, 'text']) for i in range(x_cv.shape[0])]

test_text = [tokenizer.tokenize(x_test.loc[i, 'text']) for i in range(x_test.shape[0])]

In [None]:
# Tokenize the labelled span (selected_text) of every row, split by split,
# with the same tokenizer that is applied to the full tweets.
train_stext = [
    tokenizer.tokenize(x_train.loc[row, 'selected_text'])
    for row in range(x_train.shape[0])
]

cv_stext = [
    tokenizer.tokenize(x_cv.loc[row, 'selected_text'])
    for row in range(x_cv.shape[0])
]

test_stext = [
    tokenizer.tokenize(x_test.loc[row, 'selected_text'])
    for row in range(x_test.shape[0])
]

## Vectorization

In [None]:
#https://machinelearningmastery.com/use-word-embedding-layers-deep-learning-keras/
# Fit the token -> integer vocabulary on the training tweets only, encode all
# splits with it, and pad the tweet sequences to a fixed length.

text_tokenizer = Tokenizer()
text_tokenizer.fit_on_texts(train_text)
vocab_size_1 = 1 + len(text_tokenizer.word_index)
print("vocab size is:",vocab_size_1)

max_length = 64 # max length of a tweet

# Selected-text sequences are encoded here but left unpadded for now.
train_select_text = text_tokenizer.texts_to_sequences(train_stext)
cv_select_text = text_tokenizer.texts_to_sequences(cv_stext)
test_select_text = text_tokenizer.texts_to_sequences(test_stext)

# Tweet sequences: encode, then pad at the end up to max_length.
train_text = pad_sequences(
    text_tokenizer.texts_to_sequences(train_text), maxlen=max_length, padding='post')
cv_text = pad_sequences(
    text_tokenizer.texts_to_sequences(cv_text), maxlen=max_length, padding='post')
test_text = pad_sequences(
    text_tokenizer.texts_to_sequences(test_text), maxlen=max_length, padding='post')

print("no. of rows sequences in train:",len(train_text))
print("no. of rows of sequences in validataion:", len(cv_text))
print("max length of sequences",max_length)

In [None]:
# Show one training example end to end: raw text, its integer encoding, the
# padded encoding, and the labelled span with its encoding.
i = 1
print('text:')
print(x_train.loc[i,'text'])
print('sequence of text:')
# Encode the token list rather than the raw string, so the output matches how
# train_text was produced. A raw string would instead be split and filtered
# by the Keras tokenizer itself and could yield a different sequence.
print(text_tokenizer.texts_to_sequences([tokenizer.tokenize(x_train.loc[i,'text'])]))
print('sequence of text after padding:')
print(train_text[i])
print('select text:')
print(x_train.loc[i,'selected_text'])
print('sequence of select text:')

print(train_select_text[i])

In [None]:
# Encode the sentiment label of every row as an integer id.
# The label -> id mapping is fitted on the training split only and then
# applied to all three splits; the mapping is printed at the end.
sentiment_tokenizer = Tokenizer()
sentiment_tokenizer.fit_on_texts(x_train['sentiment'])
vocab_size_2 = len(sentiment_tokenizer.word_index) +1

train_sentiment = sentiment_tokenizer.texts_to_sequences(x_train['sentiment'])
cv_sentiment = sentiment_tokenizer.texts_to_sequences(x_cv['sentiment'])
test_sentiment = sentiment_tokenizer.texts_to_sequences(x_test['sentiment'])


print(sentiment_tokenizer.word_index)

## Train target creation

In [None]:
# One-hot start/end targets for the training split: for each row, a 1 at the
# index of the first and of the last token that overlaps the labelled span.
n_rows = x_train.shape[0]
train_start = np.zeros((n_rows, 64), dtype='int32')
train_end = np.zeros((n_rows, 64), dtype='int32')

for k in range(n_rows):
    tweet = x_train.loc[k, 'text']
    # Work on space-free strings so character positions line up with the
    # concatenated tokens (the tokenizer output carries no spaces).
    compact_tweet = tweet.replace(" ", "")
    compact_span = x_train.loc[k, 'selected_text'].replace(" ", "")

    # Character mask: 1 for every character inside the first occurrence of
    # the span in the tweet, 0 elsewhere (all 0 when the span is not found).
    char_targets = [0] * len(compact_tweet)
    span_begin = compact_tweet.find(compact_span)
    if span_begin >= 0:
        span_stop = span_begin + len(compact_span)
        char_targets[span_begin:span_stop] = [1] * len(compact_span)

    # Walk the tokens, tracking each token's character range, and collect
    # the indices of tokens that contain at least one masked character.
    targets_index = []
    cursor = 0
    for token_pos, token in enumerate(tokenizer.tokenize(tweet)):
        token_stop = cursor + len(token)
        if any(char_targets[cursor:token_stop]):
            targets_index.append(token_pos)
        cursor = token_stop

    train_start[k, targets_index[0]] = 1
    train_end[k, targets_index[-1]] = 1
        
        

In [None]:
# To join selected words back to sentence.
# https://stackoverflow.com/questions/21948019/python-untokenize-a-sentence
import re
def untokenize(words):
    """
    Join a list of tokens back into a sentence.

    The tokens are joined with single spaces and a fixed, ordered series of
    string and regex substitutions then removes the spaces that tokenizing
    introduced around punctuation, quotes, contractions and a few URL / symbol
    patterns. The result is stripped of leading and trailing whitespace.

    The intent is that `untokenize(tokenize(text))` is close to `text`; this
    is a heuristic and is not guaranteed to be an exact inverse.

    words: iterable of token strings
    returns: the reconstructed string ('' for an empty token list)
    """
    text = ' '.join(words)
    step1 = text.replace("`` ", '"').replace(" ''", '"').replace('. . .',  '...')
    step2 = step1.replace(" ( ", " (").replace(" ) ", ") ")
    # Attach sentence punctuation to the preceding word, first when followed
    # by a space or quote, then at the very end of the string.
    step3 = re.sub(r' ([.,:;?!%]+)([ \'"`])', r"\1\2", step2)
    step4 = re.sub(r' ([.,:;?!%]+)$', r"\1", step3)
    step5 = step4.replace(" '", "'").replace(" n't", "n't").replace(
         "can not", "cannot")

    # Remaining clean-ups, applied strictly in this order.
    cleanups = (
        (" ` ", "`"),
        (" '", "'"),
        ("# ", '#'),
        (" - ", "-"),
        (" ?", "?"),
        (' ,', ','),
        (' !', '!'),
        ('( ', '('),
        ('< ', '<'),
        (' && ', '&&'),
        (' / ', '/'),
        ('. com', '.com'),
        # Keep the colon of the URL scheme separator ("http :// x" -> "http://x").
        (' :// ', '://'),
        ('$ ', '$'),
    )
    step6 = step5
    for spaced, joined in cleanups:
        step6 = step6.replace(spaced, joined)
    return step6.strip()

# Quick sanity check of untokenize on a hand-written token list; the joined
# sentence is displayed as the cell output.
tokenized = ['I', "'ve", 'found', 'a', 'medicine', 'for', 'my','disease', '.']
sent = untokenize(tokenized)
sent

In [None]:
# Converting lists to arrays.
# The selected-text sequences have different lengths at this point (they are
# only padded further down), so they are stored as object arrays: NumPy 1.24+
# raises when asked to build a regular array from rows of unequal length.
train_select_text = np.array(train_select_text, dtype=object)
cv_select_text = np.array(cv_select_text, dtype=object)
test_select_text = np.array(test_select_text, dtype=object)

# Each sentiment is encoded as a one-element sequence, so these are regular arrays.
train_sentiment = np.array(train_sentiment)
cv_sentiment = np.array(cv_sentiment)
test_sentiment = np.array(test_sentiment)

In [None]:
# metric
def jaccard(str1, str2):
    """
    Word-level Jaccard similarity between two strings.

    Both strings are lower-cased and split on whitespace; the score is
    |A & B| / |A | B| over the resulting sets of words.

    str1, str2: strings to compare
    returns: float in [0, 1]. When neither string contains any word the
             union is empty; the two inputs are then treated as identical
             and 1.0 is returned instead of dividing by zero.
    """
    a = set(str1.lower().split())
    b = set(str2.lower().split())
    union_size = len(a | b)
    if union_size == 0:
        return 1.0
    return float(len(a & b)) / union_size

In [None]:
# Checking whether all created targets are correct: rebuild the span from the
# start/end targets and compare it with the labelled selected_text.
# Up to 10 mismatching rows are printed as examples.
count = 0
score = 0
sample = 0
for i in range(x_train.shape[0]):
    temp = tokenizer.tokenize(x_train.loc[i,'text'])
    actual = x_train.loc[i,'selected_text']
    # Tokens from the start target to the end target (inclusive), re-joined.
    label = untokenize(temp[np.argmax(train_start[i]):np.argmax(train_end[i])+1])
    row_score = jaccard(actual, label)

    if actual == label:
        count += 1
    elif sample < 10:
        print(i)
        print('actual:', actual)
        print('label:', label)
        print('jaccard:', row_score)
        sample += 1

    score += row_score

print('##############################################################################')
if count == x_train.shape[0]:
    print('all targets are correct')
else:
    print(count,'targets are correct')
print('jaccard score is:',score/x_train.shape[0])

## Note: There will be some errors in target creation due to noisy labels in data.

## CV target creation

In [None]:
# One-hot start/end targets for the validation split: for each row, a 1 at
# the index of the first and of the last token that overlaps the labelled span.
n_rows = x_cv.shape[0]
cv_start = np.zeros((n_rows, 64), dtype='int32')
cv_end = np.zeros((n_rows, 64), dtype='int32')

for k in range(n_rows):
    tweet = x_cv.loc[k, 'text']
    # Work on space-free strings so character positions line up with the
    # concatenated tokens (the tokenizer output carries no spaces).
    compact_tweet = tweet.replace(" ", "")
    compact_span = x_cv.loc[k, 'selected_text'].replace(" ", "")

    # Character mask: 1 for every character inside the first occurrence of
    # the span in the tweet, 0 elsewhere (all 0 when the span is not found).
    char_targets = [0] * len(compact_tweet)
    span_begin = compact_tweet.find(compact_span)
    if span_begin >= 0:
        span_stop = span_begin + len(compact_span)
        char_targets[span_begin:span_stop] = [1] * len(compact_span)

    # Walk the tokens, tracking each token's character range, and collect
    # the indices of tokens that contain at least one masked character.
    targets_index = []
    cursor = 0
    for token_pos, token in enumerate(tokenizer.tokenize(tweet)):
        token_stop = cursor + len(token)
        if any(char_targets[cursor:token_stop]):
            targets_index.append(token_pos)
        cursor = token_stop

    cv_start[k, targets_index[0]] = 1
    cv_end[k, targets_index[-1]] = 1
        
        

In [None]:
# Checking whether all created targets are correct: rebuild the span from the
# start/end targets and compare it with the labelled selected_text.
# Up to 10 mismatching rows are printed as examples.
count = 0
score = 0
sample = 0
for i in range(x_cv.shape[0]):
    temp = tokenizer.tokenize(x_cv.loc[i,'text'])
    actual = x_cv.loc[i,'selected_text']
    # Tokens from the start target to the end target (inclusive), re-joined.
    label = untokenize(temp[np.argmax(cv_start[i]):np.argmax(cv_end[i])+1])
    row_score = jaccard(actual, label)

    if actual == label:
        count += 1
    elif sample < 10:
        print(i)
        print('actual:', actual)
        print('label:', label)
        print('jaccard:', row_score)
        sample += 1

    score += row_score

print('##############################################################################')
if count == x_cv.shape[0]:
    print('all targets are correct')
else:
    print(count,'targets are correct')
print('jaccard score is:',score/x_cv.shape[0])

## Test target creation

In [None]:
# One-hot start/end targets for the held-out test split: for each row, a 1 at
# the index of the first and of the last token that overlaps the labelled span.
n_rows = x_test.shape[0]
test_start = np.zeros((n_rows, 64), dtype='int32')
test_end = np.zeros((n_rows, 64), dtype='int32')

for k in range(n_rows):
    tweet = x_test.loc[k, 'text']
    # Work on space-free strings so character positions line up with the
    # concatenated tokens (the tokenizer output carries no spaces).
    compact_tweet = tweet.replace(" ", "")
    compact_span = x_test.loc[k, 'selected_text'].replace(" ", "")

    # Character mask: 1 for every character inside the first occurrence of
    # the span in the tweet, 0 elsewhere (all 0 when the span is not found).
    char_targets = [0] * len(compact_tweet)
    span_begin = compact_tweet.find(compact_span)
    if span_begin >= 0:
        span_stop = span_begin + len(compact_span)
        char_targets[span_begin:span_stop] = [1] * len(compact_span)

    # Walk the tokens, tracking each token's character range, and collect
    # the indices of tokens that contain at least one masked character.
    targets_index = []
    cursor = 0
    for token_pos, token in enumerate(tokenizer.tokenize(tweet)):
        token_stop = cursor + len(token)
        if any(char_targets[cursor:token_stop]):
            targets_index.append(token_pos)
        cursor = token_stop

    test_start[k, targets_index[0]] = 1
    test_end[k, targets_index[-1]] = 1
        
        

In [None]:
# Checking whether all created targets are correct: rebuild the span from the
# start/end targets and compare it with the labelled selected_text.
# Up to 10 mismatching rows are printed as examples.
count = 0
score = 0
sample = 0
for i in range(x_test.shape[0]):
    temp = tokenizer.tokenize(x_test.loc[i,'text'])
    actual = x_test.loc[i,'selected_text']
    # Tokens from the start target to the end target (inclusive), re-joined.
    label = untokenize(temp[np.argmax(test_start[i]):np.argmax(test_end[i])+1])
    row_score = jaccard(actual, label)

    if actual == label:
        count += 1
    elif sample < 10:
        print(i)
        print('actual:', actual)
        print('label:', label)
        print('jaccard:', row_score)
        sample += 1

    score += row_score

print('##############################################################################')
if count == x_test.shape[0]:
    print('all targets are correct')
else:
    print(count,'targets are correct')
print('jaccard score is:',score/x_test.shape[0])

## Word embeddings

In [None]:
EMBEDDING_FILE = '../input/glove-twitter/glove.twitter.27B.200d.txt'

def get_coefs(word, *arr):
    """Return (word, vector): the word unchanged and the remaining fields as a float32 array."""
    return word, np.asarray(arr, dtype='float32')

# Each line is split on single spaces: the first field is the word, the rest
# are its coefficients. The with-block closes the file once the dict is
# built, and the encoding is stated explicitly rather than taken from the locale.
with open(EMBEDDING_FILE, encoding='utf-8') as embedding_file:
    embeddings_index = dict(
        get_coefs(*line.rstrip().rsplit(' ')) for line in embedding_file
    )

In [None]:
# Embedding matrix for the tweet vocabulary.
# Rows start as random draws from a normal distribution with the mean/std of
# all pretrained coefficients; rows of words found in the pretrained index
# are then overwritten with their pretrained vector. `count` is the number of
# vocabulary words that were found.

# np.stack needs a real sequence; a dict view is rejected by recent NumPy.
all_embs = np.stack(list(embeddings_index.values()))
emb_mean,emb_std = all_embs.mean(), all_embs.std()
embed_size = all_embs.shape[1]
count=0
word_index = text_tokenizer.word_index
nb_words = len(word_index) + 1
#change below line if computing normal stats is too slow
embedding_matrix = np.random.normal(emb_mean, emb_std, (nb_words, embed_size))
for word, i in word_index.items():
    if i >= nb_words: continue
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector
        count+=1
count

In [None]:
# Pad the encoded selected-text sequences at the end, up to max_length,
# for the train, cv and test splits.
train_select_text, cv_select_text, test_select_text = (
    pad_sequences(sequences, maxlen=max_length, padding='post')
    for sequences in (train_select_text, cv_select_text, test_select_text)
)


In [None]:
# Embedding matrix for the sentiment vocabulary, built the same way as the
# tweet one: random normal initialisation from the pretrained statistics,
# then pretrained vectors copied in for the labels found in the index.
# `count` is the number of labels that were found.

# np.stack needs a real sequence; a dict view is rejected by recent NumPy.
all_embs = np.stack(list(embeddings_index.values()))
emb_mean,emb_std = all_embs.mean(), all_embs.std()
embed_size = all_embs.shape[1]
count=0
word_index = sentiment_tokenizer.word_index
nb_words = len(word_index) + 1
#change below line if computing normal stats is too slow
embedding_matrix2  = np.random.normal(emb_mean, emb_std, (nb_words, embed_size))
for word, i in word_index.items():
    if i >= nb_words: continue
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix2[i] = embedding_vector
        count+=1
count

## Modelling

In [None]:
# https://blog.keras.io/using-pre-trained-word-embeddings-in-a-keras-model.html


def build_model(n1,n2,n3,n4,drop,mode,bidir=False):

    """
    Build and compile the two-input / two-output span-prediction network.

    The model takes a padded tweet (64 token ids) and a sentiment id, embeds
    both with frozen pretrained embeddings (the module-level embedding_matrix
    and embedding_matrix2), encodes them, and feeds the flattened,
    concatenated encodings to two heads. Each head is Dense(n3) -> Dropout ->
    Dense(64, softmax): one for the start token position, one for the end.

    inputs:

    n1: no. of units in the first recurrent layer if mode is 'lstm', else no. of filters in the first conv layer
    n2: no. of units in the second recurrent layer if mode is 'lstm', else kernel size of the conv layers
    n3: no. of neurons in the dense layer of each output head
    n4: accepted for backward compatibility; not used by the current architecture
    drop: dropout rate
    mode: 'lstm' (recurrent encoder; the layers used are GRUs) or 'conv' (Conv1D encoder)
    bidir: only used in 'lstm' mode. If True the tweet GRUs are wrapped in
           Bidirectional and the sentiment embedding goes to the heads
           directly; if False a GRU(n2) is also applied to the sentiment embedding.

    output:

    returns the compiled model (Adam optimizer, categorical cross-entropy loss)

    raises:

    ValueError if mode is neither 'lstm' nor 'conv'
    """

    if mode not in ('lstm', 'conv'):
        raise ValueError("mode must be 'lstm' or 'conv', got %r" % (mode,))

    keras.backend.clear_session()

    def _gru(units):
        # Sequence-returning GRU with the constraint/regulariser shared by all recurrent layers.
        return keras.layers.GRU(units, return_sequences=True, kernel_constraint=unit_norm(),
                                kernel_regularizer=regularizers.l2(0.0001))

    # --- tweet branch ---
    i1 = Input(shape=(64,), dtype='int32')
    e = Embedding(vocab_size_1, 200, weights=[embedding_matrix], trainable=False)(i1)
    if mode == 'lstm':
        if bidir:
            x1 = Bidirectional(_gru(n1))(e)
            x1 = Bidirectional(_gru(n2))(x1)
        else:
            x1 = _gru(n1)(e)
            x1 = _gru(n2)(x1)
    else:
        x1 = Conv1D(n1, n2, activation='relu')(e)
        # Filter counts must be integers, hence floor division.
        x1 = Conv1D(n1 // 2, n2, activation='relu')(x1)

    # --- sentiment branch ---
    i2 = Input(shape=(1,), dtype='int32')
    # trainable must be the boolean False: the string 'False' is truthy and
    # would leave the pretrained sentiment embedding trainable.
    e2 = Embedding(vocab_size_2, 200, weights=[embedding_matrix2], trainable=False)(i2)
    if mode == 'lstm':
        x2 = e2 if bidir else _gru(n2)(e2)
    else:
        # NOTE(review): this applies a kernel of size n2 to a length-1
        # sequence with the default padding; presumably only n2 == 1 is
        # usable here -- confirm before relying on 'conv' mode.
        x2 = Conv1D(n1 // 2, n2, activation='relu')(e2)

    x2 = tf.keras.layers.Flatten()(x2)
    x1 = tf.keras.layers.Flatten()(x1)

    con = tf.keras.layers.Concatenate()([x1,x2])

    # --- start-position head ---
    x1 = keras.layers.Dense(n3, activation = 'relu', kernel_initializer='he_uniform',kernel_constraint=unit_norm(),kernel_regularizer=regularizers.l2(0.0001))(con)
    x1 = Dropout(drop)(x1)

    # --- end-position head ---
    x2 = keras.layers.Dense(n3, activation = 'relu', kernel_initializer='he_uniform',kernel_constraint=unit_norm(),kernel_regularizer=regularizers.l2(0.0001))(con)
    x2 = Dropout(drop)(x2)

    output1 = keras.layers.Dense(64, activation = 'softmax')(x1)
    output2 = keras.layers.Dense(64, activation = 'softmax')(x2)

    model = keras.models.Model(inputs =[i1,i2], outputs = [output1,output2] )

    opt = keras.optimizers.Adam(lr=3e-5, beta_1=0.9, beta_2=0.999, epsilon=1e-08, decay=0.0,clipnorm=1)

    model.compile(optimizer = opt, loss = 'categorical_crossentropy' )


    return model


In [None]:
# Model checkpoint callback: watches val_loss and, because save_best_only is
# True, writes the model to `filepath` only when that metric improves.
filepath = "/kaggle/working/best_model.h5" 
checkpoint = ModelCheckpoint(filepath=filepath, monitor='val_loss', verbose=1, save_best_only=True, mode='auto')



In [None]:
# Build the recurrent variant with bidir=True.
# Positional arguments are n1=128, n2=64, n3=64, n4=16, drop=0.2, mode='lstm'.
model = build_model(128,64,64,16,0.2,'lstm',True )


In [None]:
# Draw the layer graph of the model, including tensor shapes.
plot_model(model, show_shapes = True)


In [None]:
#model.fit([train_text,train_sentiment],[train_start,train_end], validation_data = ([cv_text,cv_sentiment], [cv_start, cv_end]),epochs =20, batch_size = 32, callbacks= [checkpoint])

In [None]:
#https://stackoverflow.com/questions/51700351/valueerror-unknown-metric-function-when-using-custom-metric-in-keras
# Load a previously saved model from an attached input dataset. The model
# built in this notebook is not trained here (the fit call is commented out).
best_model = load_model("../input/nn-word-attention/best_model (10).h5",)

In [None]:
# Predict start/end outputs for the held-out split.
# NOTE: this rebinds test_start / test_end, which until now held the
# ground-truth targets; from here on they hold model predictions.
test_start,test_end= best_model.predict([test_text,test_sentiment])

In [None]:
# Mean Jaccard score on the held-out split, taking for every row the tokens
# from the arg-max start position to the arg-max end position (inclusive).
score = 0
for i in range(x_test.shape[0]):
    temp = tokenizer.tokenize(x_test.loc[i,'text'])
    first_token = np.argmax(test_start[i])
    last_token = np.argmax(test_end[i])
    predicted = untokenize(temp[first_token:last_token + 1])
    score += jaccard(predicted, x_test.selected_text[i])

print(score/x_test.shape[0])

In [None]:
# Calculating the Jaccard score with a fallback: the whole tweet is used as
# the prediction for neutral rows and for rows whose predicted start lies
# after the predicted end. The per-row score is stored in x_test.
score = 0
# Float initial value so the float scores assigned below match the column dtype.
x_test['jaccard_score'] = 0.0
for i in range(x_test.shape[0]):
    first_token = np.argmax(test_start[i])
    last_token = np.argmax(test_end[i])

    if x_test.loc[i,'sentiment'] != 'neutral' and first_token <= last_token:
        temp = tokenizer.tokenize(x_test.loc[i,'text'])
        predicted = untokenize(temp[first_token:last_token + 1])
    else:
        predicted = x_test.text[i]

    # Computed once per row and used for both the column and the running total.
    row_score = jaccard(predicted, x_test.selected_text[i])
    x_test.loc[i,'jaccard_score'] = row_score
    score = score + row_score

print(score/x_test.shape[0])

In [None]:
# Average stored Jaccard score per sentiment class.
for sentiment_label in ('positive', 'negative', 'neutral'):
    subset = x_test[x_test['sentiment'] == sentiment_label]
    print('avg ' + sentiment_label + ' jaccard score: ', subset.jaccard_score.mean())


In [None]:
# Sample predictions: print every 100th held-out row with its predicted span.
separator = '##############################################################################\n'
for i in range(0, x_test.shape[0], 100):
    temp = tokenizer.tokenize(x_test.loc[i,'text'])
    actual = x_test.loc[i,'selected_text']
    # Tokens between the arg-max start and end positions (inclusive), re-joined.
    predicted = untokenize(temp[np.argmax(test_start[i]):np.argmax(test_end[i])+1])

    print('text:', x_test.loc[i,'text'])
    print('sentiment', x_test.loc[i,'sentiment'])
    print('actual:', actual)
    print('predicted:', predicted)
    print('jaccard:', jaccard(actual, predicted))
    print(separator)


# Test Data Predictions

In [None]:
test = pd.read_csv('../input/tweet-sentiment-extraction/test.csv')

# Tokenize, encode and pad the competition test tweets with the tokenizers
# fitted on the training split, and encode the sentiment labels.
test_tokens = [tokenizer.tokenize(test.loc[i, 'text']) for i in range(test.shape[0])]
test_text = pad_sequences(
    text_tokenizer.texts_to_sequences(test_tokens), maxlen=max_length, padding='post')
test_sentiment = np.array(sentiment_tokenizer.texts_to_sequences(test['sentiment']))

# Predicting using best model.
test_start, test_end = best_model.predict([test_text, test_sentiment])

# Build one predicted span per row. The whole tweet is used for neutral rows
# and for rows whose predicted start lies after the predicted end.
preds = []
for i in range(test.shape[0]):
    first_token = np.argmax(test_start[i])
    last_token = np.argmax(test_end[i])
    if test.loc[i, 'sentiment'] == 'neutral' or first_token > last_token:
        preds.append(test.text[i])
    else:
        preds.append(untokenize(test_tokens[i][first_token:last_token + 1]))

In [None]:
# Creating the submission file: start from the sample submission, replace its
# selected_text column with the predictions, and write it to submission.csv
# without the index column.
submission = pd.read_csv('../input/tweet-sentiment-extraction/sample_submission.csv')
submission['selected_text'] = preds

submission.to_csv('submission.csv', index = False)

In [None]:
# Display the submission frame.
submission