In [1]:
import pandas as pd, numpy as np
import tensorflow as tf
import tensorflow.keras.backend as K
from sklearn.model_selection import StratifiedKFold
from transformers import *
import tokenizers

In [2]:
train=pd.read_csv('tweet-sentiment-extraction-data/train.csv')
train['text']=train['text'].astype(str)
train['selected_text']=train['selected_text'].astype(str)

In [3]:
test=pd.read_csv('tweet-sentiment-extraction-data/test.csv')
test['text']=test['text'].astype(str)

In [4]:
def jaccard(str1, str2): 
    a = set(str(str1).lower().split()) 
    b = set(str(str2).lower().split())
    c = a.intersection(b)
    return float(len(c)) / (len(a) + len(b) - len(c))

In [5]:
train.dropna(axis=0, how='any', inplace=True)
test.dropna(axis=0, how='any', inplace=True)

In [6]:
MAX_LEN = 165
PATH = 'tf-roberta/'
tokenizer = tokenizers.ByteLevelBPETokenizer(
vocab_file=PATH + 'vocab-roberta-base.json',
merges_file=PATH + 'merges-roberta-base.txt',
lowercase=True,
add_prefix_space=True)

In [7]:
sentiment_id = {'positive': 1313, 'negative': 2430, 'neutral': 7974}

In [8]:
ct = train.shape[0]
input_ids = np.ones((ct,MAX_LEN),dtype='int32')
attention_mask = np.zeros((ct,MAX_LEN),dtype='int32')
token_type_ids = np.zeros((ct,MAX_LEN),dtype='int32')
start_tokens = np.zeros((ct,MAX_LEN),dtype='int32')
end_tokens = np.zeros((ct,MAX_LEN),dtype='int32')

In [9]:
for k in range(train.shape[0]):
    
    text1 = " "+" ".join(train.loc[k,'text'].split())
    text2 = " ".join(train.loc[k,'selected_text'].split())
    
    idx = text1.find(text2)
    chars = np.zeros((len(text1)))
    chars[idx:idx+len(text2)]=1
    if text1[idx-1]==' ':
        chars[idx-1] = 1 
    enc = tokenizer.encode(text1) 
    
    offsets = []; idx=0
    for t in enc.ids:
        
        w = tokenizer.decode([t])
        offsets.append((idx,idx+len(w)))
        idx += len(w)
        
        toks = []
        for i,(a,b) in enumerate(offsets):
            sm = np.sum(chars[a:b])
            if sm>0: toks.append(i) 
                
        s_tok = sentiment_id[train.loc[k,'sentiment']]
        input_ids[k,:len(enc.ids)+5] = [0] + enc.ids + [2,2] + [s_tok] + [2]
        attention_mask[k,:len(enc.ids)+5] = 1
        if len(toks)>0:
            start_tokens[k,toks[0]+1] = 1
            end_tokens[k,toks[-1]+1] = 1
            
            

In [10]:
def build_model():
    ids = tf.keras.layers.Input((MAX_LEN,), dtype=tf.int32)
    att = tf.keras.layers.Input((MAX_LEN,), dtype=tf.int32)
    tok = tf.keras.layers.Input((MAX_LEN,), dtype=tf.int32)

    config = RobertaConfig.from_pretrained(PATH+'config-roberta-base.json')
    bert_model = TFRobertaModel.from_pretrained(PATH+'pretrained-roberta-base.h5',config=config)
    x = bert_model(ids,attention_mask=att,token_type_ids=tok)
    
    x1 = tf.keras.layers.Dropout(0.1)(x[0]) 
    x1 = tf.keras.layers.Conv1D(256, 5,padding='same')(x[0])
    x1 = tf.keras.layers.LeakyReLU()(x1)
    x1 = tf.keras.layers.Conv1D(128, 5,padding='same')(x1)
    x1 = tf.keras.layers.LeakyReLU()(x1)
    x1 = tf.keras.layers.Conv1D(64, 5,padding='same')(x1)
    x1 = tf.keras.layers.LeakyReLU()(x1)
    x1 = tf.keras.layers.Dense(5)(x1)
    x1 = tf.keras.layers.LeakyReLU()(x1)
    x1 = tf.keras.layers.Dense(1)(x1)
    x1 = tf.keras.layers.Flatten()(x1)
    x1 = tf.keras.layers.Activation('softmax')(x1)
    
    x2 = tf.keras.layers.Dropout(0.1)(x[0]) 
    x2 = tf.keras.layers.Conv1D(256, 5, padding='same')(x[0])
    x2 = tf.keras.layers.LeakyReLU()(x2)
    x2 = tf.keras.layers.Conv1D(128, 5, padding='same')(x2)
    x2 = tf.keras.layers.LeakyReLU()(x2)
    x2 = tf.keras.layers.Conv1D(64, 5, padding='same')(x2)
    x2 = tf.keras.layers.LeakyReLU()(x2)
    x2 = tf.keras.layers.Dense(5)(x2)
    x2 = tf.keras.layers.LeakyReLU()(x2)
    x2 = tf.keras.layers.Dense(1)(x2)
    x2 = tf.keras.layers.Flatten()(x2)
    x2 = tf.keras.layers.Activation('softmax')(x2)

    model = tf.keras.models.Model(inputs=[ids, att, tok], outputs=[x1,x2])
    optimizer = tf.keras.optimizers.Adam(learning_rate=3e-5)
    model.compile(loss=tf.keras.losses.BinaryCrossentropy(label_smoothing=0.15), optimizer=optimizer)
    return model

In [13]:
n_splits = 5

In [17]:
jac = [];VER='v4';
oof_start = np.zeros((input_ids.shape[0],MAX_LEN))
oof_end = np.zeros((input_ids.shape[0],MAX_LEN))
skf = StratifiedKFold(n_splits=n_splits,shuffle=True,random_state=777)
for fold,(idxT,idxV) in enumerate(skf.split(input_ids,train.sentiment.values)):
    K.clear_session()
    model = build_model()
    sv = tf.keras.callbacks.ModelCheckpoint(
        '%s-roberta-%i.h5'%(VER,fold), monitor='val_loss', verbose=1, save_best_only=True,
        save_weights_only=True, mode='auto', save_freq='epoch')
    hist = model.fit([input_ids[idxT,], attention_mask[idxT,], token_type_ids[idxT,]], [start_tokens[idxT,], end_tokens[idxT,]], 
        epochs=5, batch_size=8, verbose=1, callbacks=[sv],
        validation_data=([input_ids[idxV,],attention_mask[idxV,],token_type_ids[idxV,]], 
        [start_tokens[idxV,], end_tokens[idxV,]]))

Epoch 1/5
Epoch 00001: val_loss improved from inf to 0.99663, saving model to v4-roberta-0.h5
Epoch 2/5
Epoch 00002: val_loss improved from 0.99663 to 0.99549, saving model to v4-roberta-0.h5
Epoch 3/5
Epoch 00003: val_loss did not improve from 0.99549
Epoch 4/5
Epoch 00004: val_loss did not improve from 0.99549
Epoch 5/5
Epoch 00005: val_loss did not improve from 0.99549
Epoch 1/5
Epoch 00001: val_loss improved from inf to 0.99199, saving model to v4-roberta-1.h5
Epoch 2/5
Epoch 00002: val_loss improved from 0.99199 to 0.99174, saving model to v4-roberta-1.h5
Epoch 3/5
Epoch 00003: val_loss did not improve from 0.99174
Epoch 4/5
Epoch 00004: val_loss did not improve from 0.99174
Epoch 5/5
Epoch 00005: val_loss did not improve from 0.99174
Epoch 1/5
Epoch 00001: val_loss improved from inf to 0.99244, saving model to v4-roberta-2.h5
Epoch 2/5
Epoch 00002: val_loss improved from 0.99244 to 0.99221, saving model to v4-roberta-2.h5
Epoch 3/5
Epoch 00003: val_loss did not improve from 0.992

Epoch 1/5
Epoch 00001: val_loss improved from inf to 0.99406, saving model to v4-roberta-3.h5
Epoch 2/5
Epoch 00002: val_loss improved from 0.99406 to 0.99343, saving model to v4-roberta-3.h5
Epoch 3/5
Epoch 00003: val_loss did not improve from 0.99343
Epoch 4/5
Epoch 00004: val_loss did not improve from 0.99343
Epoch 5/5
Epoch 00005: val_loss did not improve from 0.99343
Epoch 1/5
Epoch 00001: val_loss improved from inf to 0.99469, saving model to v4-roberta-4.h5
Epoch 2/5
Epoch 00002: val_loss improved from 0.99469 to 0.99448, saving model to v4-roberta-4.h5
Epoch 3/5
Epoch 00003: val_loss did not improve from 0.99448
Epoch 4/5
Epoch 00004: val_loss did not improve from 0.99448
Epoch 5/5
Epoch 00005: val_loss did not improve from 0.99448


In [14]:
c_test = test.shape[0]
input_ids_t = np.ones((c_test,MAX_LEN),dtype='int32')
attention_mask_t = np.zeros((c_test,MAX_LEN),dtype='int32')
token_type_ids_t = np.zeros((c_test,MAX_LEN),dtype='int32')

for k in range(test.shape[0]):
        
    # INPUT_IDS
    text1 = " "+" ".join(test.loc[k,'text'].split())
    enc = tokenizer.encode(text1)                
    s_tok = sentiment_id[test.loc[k,'sentiment']]
    input_ids_t[k,:len(enc.ids)+5] = [0] + enc.ids + [2,2] + [s_tok] + [2]
    attention_mask_t[k,:len(enc.ids)+5] = 1


In [15]:
preds_start = np.zeros((input_ids_t.shape[0],MAX_LEN))
preds_end = np.zeros((input_ids_t.shape[0],MAX_LEN))

for i in range(5):
    model = build_model()
    model.load_weights('v4-roberta-%i.h5'%i)
    
    preds = model.predict([input_ids_t,attention_mask_t,token_type_ids_t],verbose=1)
    preds_start += preds[0]/n_splits
    preds_end += preds[1]/n_splits



In [17]:
model.summary()

Model: "model_5"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_16 (InputLayer)           [(None, 165)]        0                                            
__________________________________________________________________________________________________
input_17 (InputLayer)           [(None, 165)]        0                                            
__________________________________________________________________________________________________
input_18 (InputLayer)           [(None, 165)]        0                                            
__________________________________________________________________________________________________
tf_roberta_model_5 (TFRobertaMo ((None, 165, 768), ( 124645632   input_16[0][0]                   
____________________________________________________________________________________________

In [18]:


all = []
for k in range(input_ids_t.shape[0]):
    a = np.argmax(preds_start[k,])
    b = np.argmax(preds_end[k,])
    if a>b: 
        st = test.loc[k,'text']
    else:
        text1 = " "+" ".join(test.loc[k,'text'].split())
        enc = tokenizer.encode(text1)
        st = tokenizer.decode(enc.ids[a-1:b])
    all.append(st)



In [19]:
test['selected_text'] = all
test

Unnamed: 0,textID,text,sentiment,selected_text
0,f87dea47db,Last session of the day http://twitpic.com/67ezh,neutral,last
1,96d74cb729,Shanghai is also really exciting (precisely -...,positive,exciting
2,eee518ae67,"Recession hit Veronique Branquinho, she has to...",negative,shame
3,01082688c6,happy bday!,positive,happy
4,33987a8ee5,http://twitpic.com/4w75p - I like it!!,positive,i like
...,...,...,...,...
3529,e5f0e6ef4b,"its at 3 am, im very tired but i can`t sleep ...",negative,tired
3530,416863ce47,All alone in this old house again. Thanks for...,positive,thanks
3531,6332da480c,I know what you mean. My little dog is sinkin...,negative,depression...
3532,df1baec676,_sutra what is your next youtube video gonna b...,positive,love


In [21]:
test[['textID','selected_text']].to_csv('submission.csv',index=False)