<a href="https://colab.research.google.com/github/sangeetsaurabh/tweet_sentiment_extraction/blob/master/keras_transformer/keras_roberta_implementation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Keras Roberta transformer implementation

#### Install the right modules

In [1]:
!pip install transformers

Collecting transformers
[?25l  Downloading https://files.pythonhosted.org/packages/9c/35/1c3f6e62d81f5f0daff1384e6d5e6c5758682a8357ebc765ece2b9def62b/transformers-3.0.0-py3-none-any.whl (754kB)
[K     |████████████████████████████████| 757kB 4.5MB/s 
[?25hCollecting sacremoses
[?25l  Downloading https://files.pythonhosted.org/packages/7d/34/09d19aff26edcc8eb2a01bed8e98f13a1537005d31e95233fd48216eed10/sacremoses-0.0.43.tar.gz (883kB)
[K     |████████████████████████████████| 890kB 13.0MB/s 
Collecting sentencepiece
[?25l  Downloading https://files.pythonhosted.org/packages/d4/a4/d0a884c4300004a78cca907a6ff9a5e9fe4f090f5d95ab341c53d28cbc58/sentencepiece-0.1.91-cp36-cp36m-manylinux1_x86_64.whl (1.1MB)
[K     |████████████████████████████████| 1.1MB 29.4MB/s 
Collecting tokenizers==0.8.0-rc4
[?25l  Downloading https://files.pythonhosted.org/packages/e8/bd/e5abec46af977c8a1375c1dca7cb1e5b3ec392ef279067af7f6bc50491a0/tokenizers-0.8.0rc4-cp36-cp36m-manylinux1_x86_64.whl (3.0MB)
[K   

In [2]:
import pandas as pd, numpy as np
import tensorflow as tf
import tensorflow.keras.backend as K
from sklearn.model_selection import StratifiedKFold
from transformers import *
import tokenizers
import math
print('TF version',tf.__version__)

TF version 2.2.0


### Set up the path

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/drive


In [None]:
data_folder = "/content/drive/My Drive/tweet-sentiment-extraction/data"
keras_folder = "/content/drive/My Drive/tweet-sentiment-extraction/keras/"
tmp_folder = '/tmp'

In [None]:
import shutil
shutil.copyfile('/content/drive/My Drive/tweet-sentiment-extraction/data/roberta-large-merges.txt', '/tmp/roberta-large-merges.txt')
shutil.copyfile('/content/drive/My Drive/tweet-sentiment-extraction/data/roberta-large-vocab.json', '/tmp/roberta-large-vocab.json')
shutil.copyfile('/content/drive/My Drive/tweet-sentiment-extraction/data/roberta-base-config.json', '/tmp/roberta-base-config.json')


'/tmp/roberta-base-config.json'

### Prepare the input

In [None]:
MAX_LEN = 96
PATH = '/tmp/'
tokenizer = tokenizers.ByteLevelBPETokenizer(
    vocab_file=PATH+'roberta-large-vocab.json', 
    merges_file=PATH+'roberta-large-merges.txt', 
    lowercase=True,
    add_prefix_space=True
)
EPOCHS = 3 # originally 3
BATCH_SIZE = 32 # originally 32
PAD_ID = 1
SEED = 88888
LABEL_SMOOTHING = 0.1
tf.random.set_seed(SEED)
np.random.seed(SEED)
sentiment_id = {'positive': 1313, 'negative': 2430, 'neutral': 7974}
train = pd.read_csv(data_folder + '/train.csv').fillna('')
train.head()

Unnamed: 0,textID,text,selected_text,sentiment
0,cb774db0d1,"I`d have responded, if I were going","I`d have responded, if I were going",neutral
1,549e992a42,Sooo SAD I will miss you here in San Diego!!!,Sooo SAD,negative
2,088c60f138,my boss is bullying me...,bullying me,negative
3,9642c003ef,what interview! leave me alone,leave me alone,negative
4,358bd9e861,"Sons of ****, why couldn`t they put them on t...","Sons of ****,",negative


In [None]:
tokenizer.get_vocab_size()

50265

In [None]:
config = RobertaConfig.from_pretrained("roberta-base",output_hidden_states=True)
bert_model = TFRobertaModel.from_pretrained("roberta-base",config=config)

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=481.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=657434796.0, style=ProgressStyle(descri…




In [None]:
config.to_json_file('/tmp/roberta-config.json')

In [None]:
bert_model.save_pretrained(tmp_folder)

#### Convert to training data based upon Roberta input requirements

In [None]:
###Change here
ct = train.shape[0]
input_ids = np.ones((ct,MAX_LEN),dtype='int32')
attention_mask = np.zeros((ct,MAX_LEN),dtype='int32')
token_type_ids = np.zeros((ct,MAX_LEN),dtype='int32')
start_tokens = np.zeros((ct,MAX_LEN),dtype='int32')
end_tokens = np.zeros((ct,MAX_LEN),dtype='int32')

for k in range(train.shape[0]):
    
    # FIND OVERLAP
    text1 = " "+" ".join(train.loc[k,'text'].split())
    text2 = " ".join(train.loc[k,'selected_text'].split())
    idx = text1.find(text2)
    chars = np.zeros((len(text1)))
    chars[idx:idx+len(text2)]=1
    if text1[idx-1]==' ': chars[idx-1] = 1 
    enc = tokenizer.encode(text1) 
        
    # ID_OFFSETS
    offsets = []; idx=0
    for t in enc.ids:
        w = tokenizer.decode([t])
        offsets.append((idx,idx+len(w)))
        idx += len(w)
    
    # START END TOKENS
    toks = []
    for i,(a,b) in enumerate(offsets):
        sm = np.sum(chars[a:b])
        if sm>0: toks.append(i) 
        
    s_tok = sentiment_id[train.loc[k,'sentiment']]
    input_ids[k,:len(enc.ids)+5] = [0, s_tok] +[2,2] + enc.ids + [2]
    attention_mask[k,:len(enc.ids)+5] = 1
    if len(toks)>0:
        start_tokens[k,toks[0]+4] = 1
        end_tokens[k,toks[-1]+4] = 1

#### Prepare the test data as well

In [None]:
###Change here
test = pd.read_csv(data_folder + '/test.csv').fillna('')

ct = test.shape[0]
input_ids_t = np.ones((ct,MAX_LEN),dtype='int32')
attention_mask_t = np.zeros((ct,MAX_LEN),dtype='int32')
token_type_ids_t = np.zeros((ct,MAX_LEN),dtype='int32')

for k in range(test.shape[0]):
        
    # INPUT_IDS
    text1 = " "+" ".join(test.loc[k,'text'].split())
    enc = tokenizer.encode(text1)                
    s_tok = sentiment_id[test.loc[k,'sentiment']]
    input_ids_t[k,:len(enc.ids)+5] = [0, s_tok] + [2,2] + enc.ids + [2]
    attention_mask_t[k,:len(enc.ids)+5] = 1

#### Build Roberta Model

In [None]:
import pickle

def save_weights(model, dst_fn):
    weights = model.get_weights()
    with open(dst_fn, 'wb') as f:
        pickle.dump(weights, f)


def load_weights(model, weight_fn):
    with open(weight_fn, 'rb') as f:
        weights = pickle.load(f)
    model.set_weights(weights)
    return model

def loss_fn(y_true, y_pred):
    # adjust the targets for sequence bucketing
    ll = tf.shape(y_pred)[1]
    y_true = y_true[:, :ll]
    loss = tf.keras.losses.categorical_crossentropy(y_true, y_pred,
        from_logits=False, label_smoothing=LABEL_SMOOTHING)
    loss = tf.reduce_mean(loss)
    return loss


def build_model():
    ids = tf.keras.layers.Input((MAX_LEN,), dtype=tf.int32)
    att = tf.keras.layers.Input((MAX_LEN,), dtype=tf.int32)
    #tok = tf.keras.layers.Input((MAX_LEN,), dtype=tf.int32)
    padding = tf.cast(tf.equal(ids, PAD_ID), tf.int32)

    lens = MAX_LEN - tf.reduce_sum(padding, -1)
    max_len = tf.reduce_max(lens)
    ids_ = ids[:, :max_len]
    att_ = att[:, :max_len]
    #tok_ = tok[:, :max_len]

    config = RobertaConfig.from_pretrained("roberta-base",output_hidden_states=True)
    bert_model = TFRobertaModel.from_pretrained("roberta-base",config=config)
    #x = bert_model(ids_,attention_mask=att_,token_type_ids=tok_)
    x = bert_model(ids_,attention_mask=att_)

    x = tf.stack([x[2][-1], x[2][-2], x[2][-3], x[2][-4]])
    x = tf.reduce_mean(x, 0)
    
    #x1 = tf.keras.layers.Dropout(0.1)(x[0])
    x1 = tf.keras.layers.Dropout(0.1)(x)
    x1 = tf.keras.layers.Conv1D(768, 2,padding='same')(x1)
    x1 = tf.keras.layers.LeakyReLU()(x1)
    x1 = tf.keras.layers.Dense(1)(x1)
    x1 = tf.keras.layers.Flatten()(x1)
    x1 = tf.keras.layers.Activation('softmax')(x1)
    
    #x2 = tf.keras.layers.Dropout(0.1)(x[0])
    x2 = tf.keras.layers.Dropout(0.1)(x)
    x2 = tf.keras.layers.Conv1D(768, 2,padding='same')(x2)
    x2 = tf.keras.layers.LeakyReLU()(x2)
    x2 = tf.keras.layers.Dense(1)(x2)
    x2 = tf.keras.layers.Flatten()(x2)
    x2 = tf.keras.layers.Activation('softmax')(x2)

    #model = tf.keras.models.Model(inputs=[ids, att, tok], outputs=[x1,x2])
    model = tf.keras.models.Model(inputs=[ids, att], outputs=[x1,x2])
    optimizer = tf.keras.optimizers.Adam(learning_rate=3e-5) 
    model.compile(loss=loss_fn, optimizer=optimizer)
    
    # this is required as `model.predict` needs a fixed size!
    x1_padded = tf.pad(x1, [[0, 0], [0, MAX_LEN - max_len]], constant_values=0.)
    x2_padded = tf.pad(x2, [[0, 0], [0, MAX_LEN - max_len]], constant_values=0.)
    
    #padded_model = tf.keras.models.Model(inputs=[ids, att, tok], outputs=[x1_padded,x2_padded])
    padded_model = tf.keras.models.Model(inputs=[ids, att], outputs=[x1_padded,x2_padded])
    return model, padded_model

#### Performance Metrics

In [None]:
def jaccard(str1, str2): 
    a = set(str1.lower().split()) 
    b = set(str2.lower().split())
    if (len(a)==0) & (len(b)==0): return 0.5
    c = a.intersection(b)
    return float(len(c)) / (len(a) + len(b) - len(c))

In [None]:
test.shape

(3534, 3)

#### Train the Roberta model

In [None]:
jac = []; VER='v0'; DISPLAY=1 # USE display=1 FOR INTERACTIVE
oof_start = np.zeros((input_ids.shape[0],MAX_LEN))
oof_end = np.zeros((input_ids.shape[0],MAX_LEN))
preds_start = np.zeros((input_ids_t.shape[0],MAX_LEN))
preds_end = np.zeros((input_ids_t.shape[0],MAX_LEN))

skf = StratifiedKFold(n_splits=5,shuffle=True,random_state=SEED) #originally 5 splits
for fold,(idxT,idxV) in enumerate(skf.split(input_ids,train.sentiment.values)):

    print('#'*25)
    print('### FOLD %i'%(fold+1))
    print('#'*25)
    
    K.clear_session()
    model, padded_model = build_model()
        
    #sv = tf.keras.callbacks.ModelCheckpoint(
    #    '%s-roberta-%i.h5'%(VER,fold), monitor='val_loss', verbose=1, save_best_only=True,
    #    save_weights_only=True, mode='auto', save_freq='epoch')
    inpT = [input_ids[idxT,], attention_mask[idxT,], token_type_ids[idxT,]]
    targetT = [start_tokens[idxT,], end_tokens[idxT,]]
    inpV = [input_ids[idxV,],attention_mask[idxV,],token_type_ids[idxV,]]
    targetV = [start_tokens[idxV,], end_tokens[idxV,]]
    # sort the validation data
    shuffleV = np.int32(sorted(range(len(inpV[0])), key=lambda k: (inpV[0][k] == PAD_ID).sum(), reverse=True))
    inpV = [arr[shuffleV] for arr in inpV]
    targetV = [arr[shuffleV] for arr in targetV]
    weight_fn = keras_folder + '%s-roberta-%i.h5'%(VER,fold)
    #weight_fn = 'roberta.h5'
    #for epoch in range(1, EPOCHS + 1):
    #    # sort and shuffle: We add random numbers to not have the same order in each epoch
    #    shuffleT = np.int32(sorted(range(len(inpT[0])), key=lambda k: (inpT[0][k] == PAD_ID).sum() + np.random.randint(-3, 3), reverse=True))
    #    # shuffle in batches, otherwise short batches will always come in the beginning of each epoch
    #    num_batches = math.ceil(len(shuffleT) / BATCH_SIZE)
    #    batch_inds = np.random.permutation(num_batches)
    #    shuffleT_ = []
    #    for batch_ind in batch_inds:
    #        shuffleT_.append(shuffleT[batch_ind * BATCH_SIZE: (batch_ind + 1) * BATCH_SIZE])
    #    shuffleT = np.concatenate(shuffleT_)
        # reorder the input data
    #    inpT = [arr[shuffleT] for arr in inpT]
    #    targetT = [arr[shuffleT] for arr in targetT]
    #    model.fit(inpT, targetT, 
    #        epochs=epoch, initial_epoch=epoch - 1, batch_size=BATCH_SIZE, verbose=DISPLAY, callbacks=[],
    #        validation_data=(inpV, targetV), shuffle=False)  # don't shuffle in `fit`
        #break
    #    save_weights(model, weight_fn)

    print('Loading model...')
    # model.load_weights('%s-roberta-%i.h5'%(VER,fold))
    load_weights(model, weight_fn)

    print('Predicting OOF...')
    oof_start[idxV,],oof_end[idxV,] = padded_model.predict([input_ids[idxV,],attention_mask[idxV,],token_type_ids[idxV,]],verbose=DISPLAY)
    
    print('Predicting Test...')
    preds = padded_model.predict([input_ids_t,attention_mask_t,token_type_ids_t],verbose=DISPLAY)
    preds_start += preds[0]/skf.n_splits
    preds_end += preds[1]/skf.n_splits
    
    # DISPLAY FOLD JACCARD
    all = []
    for k in idxV:
        a = np.argmax(oof_start[k,])
        b = np.argmax(oof_end[k,])
        if a>b: 
            st = train.loc[k,'text'] # IMPROVE CV/LB with better choice here
        else:
            text1 = " "+" ".join(train.loc[k,'text'].split())
            enc = tokenizer.encode(text1)
            st = tokenizer.decode(enc.ids[a-4:b-3])
        all.append(jaccard(st,train.loc[k,'selected_text']))
    jac.append(np.mean(all))
    print('>>>> FOLD %i Jaccard ='%(fold+1),np.mean(all))
    print()

#########################
### FOLD 1
#########################
Loading model...
Predicting OOF...
Predicting Test...
>>>> FOLD 1 Jaccard = 0.7033096580544006

#########################
### FOLD 2
#########################
Loading model...
Predicting OOF...
Predicting Test...
>>>> FOLD 2 Jaccard = 0.7146078719165919

#########################
### FOLD 3
#########################
Loading model...
Predicting OOF...
Predicting Test...
>>>> FOLD 3 Jaccard = 0.6951409694914658

#########################
### FOLD 4
#########################
Loading model...
Predicting OOF...
Predicting Test...
>>>> FOLD 4 Jaccard = 0.6901714369653897

#########################
### FOLD 5
#########################
Loading model...
Predicting OOF...
Predicting Test...
>>>> FOLD 5 Jaccard = 0.70347827466002



#### Submission

In [None]:
all = []
for k in range(input_ids_t.shape[0]):
    a = np.argmax(preds_start[k,])
    b = np.argmax(preds_end[k,])
    if a>b: 
        st = test.loc[k,'text']
    else:
        text1 = " "+" ".join(test.loc[k,'text'].split())
        enc = tokenizer.encode(text1)
        print(text1)
        print(enc)
        st = tokenizer.decode(enc.ids[a-2:b-1])
    all.append(st)

NameError: ignored

In [None]:
test['selected_text'] = all
test[['textID','selected_text']].to_csv('submission.csv',index=False)
pd.set_option('max_colwidth', 60)
test.sample(25)

Unnamed: 0,textID,text,sentiment,selected_text
3444,7d32d4866a,You are sooo lucky. My fiance is away w/the Marine Corp...,positive,sooo lucky. my fiance
749,200f2b3566,you shouldnt have to reset more than once if it doesn`...,neutral,nt have to reset more than once if it doesn`t work right...
3136,43c61eb7a3,I can`t find my camera,negative,`t find my camera
26,334954f215,"hey peoples, dont you just hate being grounded haha, im ...",neutral,", dont you just hate being grounded haha, im just sat ea..."
3208,2dfbe0b7fb,Hmmm. http://www.djhero.com/ is down,negative,
2095,8986091fd3,wishes happy mothers` day to all moms out there.,positive,mothers`
843,7dd51bdec1,reading Evermore by Lynn Viehl.,neutral,more by lynn viehl.
172,5ff525b74b,Wondering if i cld make things any worse than they alrea...,negative,i cld make things any worse than they
100,f9382a1f7b,"oooh, sunshine! A patch of sunshine! And it will be gone...",neutral,"oh, sunshine! a patch of sunshine! and it will be gone b..."
930,7720aca199,"No, that`s not right - I remember now. You were in a f...",neutral,that`s not right - i remember now. you were in a fearle...
