# Load data and libraries

In [None]:
import pandas as pd, numpy as np
import tensorflow as tf
import tensorflow.keras.backend as K
from sklearn.model_selection import StratifiedKFold
from transformers import *
import tokenizers
print('TF version',tf.__version__)

In [None]:
def read_train():
    """Load the competition training CSV with lower-cased string text columns.

    Returns:
        DataFrame read from ``train.csv`` whose ``text`` and ``selected_text``
        columns are lower-cased and then cast to ``str``.
    """
    frame = pd.read_csv('../input/tweet-sentiment-extraction/train.csv')
    for column in ('text', 'selected_text'):
        # Lower-case first, then cast, so every cell ends up as a Python str.
        frame[column] = frame[column].str.lower().astype(str)
    return frame

def read_test():
    """Load the competition test CSV with a lower-cased string ``text`` column.

    Returns:
        DataFrame read from ``test.csv`` whose ``text`` column is lower-cased
        and then cast to ``str``.
    """
    frame = pd.read_csv('../input/tweet-sentiment-extraction/test.csv')
    lowered = frame['text'].str.lower()
    frame['text'] = lowered.astype(str)
    return frame

def read_submission():
    """Load the sample submission CSV unchanged and return it as a DataFrame."""
    return pd.read_csv('../input/tweet-sentiment-extraction/sample_submission.csv')
    
# Load the three competition frames once; later cells read and extend them.
train_df = read_train()
test_df = read_test()
submission_df = read_submission()

In [None]:
# Hold out every row after the first 27000 as a validation set.
# Explicit copies are taken so that later column assignments on these frames
# (e.g. adding `predicted_text`) write to independent DataFrames instead of
# slices of the original frame, which pandas flags with SettingWithCopyWarning.
# NOTE: this cell is not idempotent - re-running it without reloading
# `train_df` would split the already-truncated training frame.
t = train_df[:27000].copy()
# reset_index(drop=True) returns a new frame whose row labels restart at 0;
# the `.loc[k, ...]` lookups in later cells rely on that.
v = train_df[27000:].reset_index(drop=True)

train_df = t
validation_df = v

In [None]:
def jaccard(str1, str2):
    """Word-level Jaccard similarity between two strings.

    Both inputs are converted with ``str()``, lower-cased and split on
    whitespace; the score is ``|A & B| / |A | B|`` over the resulting word sets.

    Args:
        str1: first text (any object; converted with ``str``).
        str2: second text (any object; converted with ``str``).

    Returns:
        float in [0, 1]. When both inputs contain no words the ratio is 0/0;
        0.5 is returned as a neutral score instead of raising
        ZeroDivisionError.
    """
    a = set(str(str1).lower().split())
    b = set(str(str2).lower().split())
    if not a and not b:
        # Undefined ratio (empty union): return a neutral score.
        return 0.5
    c = a & b
    return float(len(c)) / (len(a) + len(b) - len(c))

# Data preprocessing

In [None]:
# Fixed width (in token positions) of every model input row built below.
MAX_LEN = 100
# Directory holding the tokenizer vocabulary/merges and the model files.
PATH = '../input/tf-roberta/'
# Byte-level BPE tokenizer built from the vocabulary and merges files in PATH.
tokenizer = tokenizers.ByteLevelBPETokenizer(
    vocab_file=PATH+'vocab-roberta-base.json', 
    merges_file=PATH+'merges-roberta-base.txt', 
    lowercase=True,
    add_prefix_space=True
)
# Sentiment label -> single token id appended to each encoded tweet.
# NOTE(review): presumably the vocabulary ids of the words "positive",
# "negative" and "neutral" - confirm against the vocab file.
sentiment_id = {'positive': 1313, 'negative': 2430, 'neutral': 7974}

Create train set

In [None]:
# Build the fixed-width model inputs and the one-hot start/end targets for
# every training tweet.
ct = train_df.shape[0]
# input_ids is filled with 1 everywhere that is not overwritten below
# (presumably the padding token id - TODO confirm); the masks and targets
# start as all zeros.
input_ids = np.ones((ct,MAX_LEN),dtype='int32')
attention_mask = np.zeros((ct,MAX_LEN),dtype='int32')
token_type_ids = np.zeros((ct,MAX_LEN),dtype='int32')
start_tokens = np.zeros((ct,MAX_LEN),dtype='int32')
end_tokens = np.zeros((ct,MAX_LEN),dtype='int32')

for k in range(train_df.shape[0]):
    
    # FIND OVERLAP
    # Normalise whitespace; text1 gets a leading space, text2 does not.
    text1 = " "+" ".join(train_df.loc[k,'text'].split())
    text2 = " ".join(train_df.loc[k,'selected_text'].split())
    idx = text1.find(text2)
    # chars[i] == 1 marks the characters of text1 covered by the selected text.
    # NOTE(review): if text2 is not found, idx is -1 and the marks below land on
    # the wrong characters - confirm this cannot happen for the training data.
    chars = np.zeros((len(text1)))
    chars[idx:idx+len(text2)]=1
    # Also mark the space directly before the selection.
    if text1[idx-1]==' ': chars[idx-1] = 1 
    enc = tokenizer.encode(text1) 
        
    # ID_OFFSETS
    # Character span of each token, rebuilt by decoding the tokens one at a
    # time and accumulating their lengths (idx is reused as the running offset).
    # Note the loop variable `t` rebinds the name used for the training slice
    # in an earlier cell.
    offsets = []; idx=0
    for t in enc.ids:
        w = tokenizer.decode([t])
        offsets.append((idx,idx+len(w)))
        idx += len(w)
    
    # START END TOKENS
    # Indices of the tokens that overlap at least one marked character.
    toks = []
    for i,(a,b) in enumerate(offsets):
        sm = np.sum(chars[a:b])
        if sm>0: toks.append(i) 
        
    s_tok = sentiment_id[train_df.loc[k,'sentiment']]
    # Row layout: 0, tweet tokens, 2, 2, sentiment token, 2 - i.e. five extra
    # positions around the tweet tokens.
    # NOTE(review): a tweet with more than MAX_LEN-5 tokens would make this
    # assignment fail with a shape mismatch.
    input_ids[k,:len(enc.ids)+5] = [0] + enc.ids + [2,2] + [s_tok] + [2]
    attention_mask[k,:len(enc.ids)+5] = 1
    if len(toks)>0:
        # +1 because the tweet tokens are shifted right by the leading 0.
        start_tokens[k,toks[0]+1] = 1
        end_tokens[k,toks[-1]+1] = 1

Create validation set

In [None]:
# Encode the validation tweets into fixed-width model inputs (no targets).
n_validation_rows = validation_df.shape[0]
input_ids_v = np.ones((n_validation_rows, MAX_LEN), dtype='int32')
attention_mask_v = np.zeros((n_validation_rows, MAX_LEN), dtype='int32')
token_type_ids_v = np.zeros((n_validation_rows, MAX_LEN), dtype='int32')

for row_idx in range(n_validation_rows):
    # Whitespace-normalised tweet with a leading space, then tokenised.
    tweet = " " + " ".join(validation_df.loc[row_idx, 'text'].split())
    token_ids = tokenizer.encode(tweet).ids
    sentiment_token = sentiment_id[validation_df.loc[row_idx, 'sentiment']]

    # Row layout: 0, tweet tokens, 2, 2, sentiment token, 2.
    used_width = len(token_ids) + 5
    input_ids_v[row_idx, :used_width] = [0] + token_ids + [2, 2, sentiment_token, 2]
    attention_mask_v[row_idx, :used_width] = 1

Create test set

In [None]:
# Encode the test tweets into fixed-width model inputs (no targets).
n_test_rows = test_df.shape[0]
input_ids_t = np.ones((n_test_rows, MAX_LEN), dtype='int32')
attention_mask_t = np.zeros((n_test_rows, MAX_LEN), dtype='int32')
token_type_ids_t = np.zeros((n_test_rows, MAX_LEN), dtype='int32')

for row_idx in range(n_test_rows):
    # Whitespace-normalised tweet with a leading space, then tokenised.
    tweet = " " + " ".join(test_df.loc[row_idx, 'text'].split())
    token_ids = tokenizer.encode(tweet).ids
    sentiment_token = sentiment_id[test_df.loc[row_idx, 'sentiment']]

    # Row layout: 0, tweet tokens, 2, 2, sentiment token, 2.
    used_width = len(token_ids) + 5
    input_ids_t[row_idx, :used_width] = [0] + token_ids + [2, 2, sentiment_token, 2]
    attention_mask_t[row_idx, :used_width] = 1

# Model

In [None]:
def scheduler(epoch):
    """Learning rate for a given epoch: 3e-5 scaled by 0.2 per epoch.

    Args:
        epoch: zero-based epoch number.

    Returns:
        ``3e-5 * 0.2**epoch``.
    """
    base_lr = 3e-5
    decay = 0.2
    return base_lr * decay ** epoch

In [None]:
def build_model():
    """Build and compile the span-extraction model.

    Three integer inputs of width MAX_LEN (token ids, attention mask, token
    type ids) feed a pretrained TFRobertaModel. Two identically shaped heads
    are applied to its first output, each ending in a softmax over the
    flattened positions: the first head is the start output, the second the
    end output.

    Returns:
        Compiled Keras model with outputs ``[start, end]``, Adam at 3e-5 and
        binary cross-entropy loss.
    """
    ids = tf.keras.layers.Input((MAX_LEN,), dtype=tf.int32)
    att = tf.keras.layers.Input((MAX_LEN,), dtype=tf.int32)
    tok = tf.keras.layers.Input((MAX_LEN,), dtype=tf.int32)

    config = RobertaConfig.from_pretrained(PATH+'config-roberta-base.json')
    bert_model = TFRobertaModel.from_pretrained(PATH+'pretrained-roberta-base.h5',config=config)
    x = bert_model(ids,attention_mask=att,token_type_ids=tok)

    def position_head(features):
        # Dropout -> Conv1D(128) -> LeakyReLU -> Conv1D(64) -> Dense(2)
        # -> LeakyReLU -> Dense(1) -> Flatten -> softmax.
        # Layers are created in the same order as a hand-unrolled head so
        # the model topology (and saved-weight layout) is unchanged.
        h = tf.keras.layers.Dropout(0.1)(features)
        h = tf.keras.layers.Conv1D(128, 2, padding='same')(h)
        h = tf.keras.layers.LeakyReLU()(h)
        h = tf.keras.layers.Conv1D(64, 2, padding='same')(h)
        h = tf.keras.layers.Dense(2)(h)
        h = tf.keras.layers.LeakyReLU()(h)
        h = tf.keras.layers.Dense(1)(h)
        h = tf.keras.layers.Flatten()(h)
        return tf.keras.layers.Activation('softmax')(h)

    # x[0] is presumably the per-token hidden states - TODO confirm.
    x1 = position_head(x[0])   # start head
    x2 = position_head(x[0])   # end head

    model = tf.keras.models.Model(inputs=[ids, att, tok], outputs=[x1,x2])
    optimizer = tf.keras.optimizers.Adam(learning_rate=3e-5)
    model.compile(loss='binary_crossentropy', optimizer=optimizer)

    return model

# Train
We will skip this stage and load an already-trained model.

In [None]:
# List the current directory (presumably to check which files are present).
!ls

In [None]:
# Number of cross-validation folds; also the number of per-fold weight files
# that the inference cells load and average.
n_splits = 4

In [None]:
# Stratified K-fold training with out-of-fold (OOF) span predictions.
jac = []       # mean Jaccard of each fold
VER = 'v1'     # prefix of the per-fold weight files
DISPLAY = 1    # Keras verbosity; USE display=1 FOR INTERACTIVE
oof_start = np.zeros((input_ids.shape[0],MAX_LEN))
oof_end = np.zeros((input_ids.shape[0],MAX_LEN))

# (start rank, end rank) pairs tried in order when decoding a span: best/best,
# best/second, second/best, second/second.
rank_pairs = ((0, 0), (0, 1), (1, 0), (1, 1))

skf = StratifiedKFold(n_splits=n_splits,shuffle=True,random_state=777)
for fold,(idxT,idxV) in enumerate(skf.split(input_ids,train_df.sentiment.values)):

    print('#'*25)
    print('### FOLD %i'%(fold+1))
    print('#'*25)

    K.clear_session()
    model = build_model()

    reduce_lr = tf.keras.callbacks.LearningRateScheduler(scheduler)

    # One path for both saving and reloading, so changing VER cannot make the
    # reload pick up a stale file written under a different prefix.
    weights_path = '%s-roberta-%i.h5'%(VER,fold)
    sv = tf.keras.callbacks.ModelCheckpoint(
        weights_path, monitor='val_loss', verbose=1, save_best_only=True,
        save_weights_only=True, mode='auto', save_freq='epoch')

    hist = model.fit([input_ids[idxT,], attention_mask[idxT,], token_type_ids[idxT,]], [start_tokens[idxT,], end_tokens[idxT,]], 
        epochs=4, batch_size=8, verbose=DISPLAY, callbacks=[sv, reduce_lr],
        validation_data=([input_ids[idxV,],attention_mask[idxV,],token_type_ids[idxV,]], 
        [start_tokens[idxV,], end_tokens[idxV,]]))

    # Reload the best checkpoint of this fold (lowest val_loss).
    print('Loading model...')
    model.load_weights(weights_path)

    print('Predicting OOF...')
    oof_start[idxV,],oof_end[idxV,] = model.predict([input_ids[idxV,],attention_mask[idxV,],token_type_ids[idxV,]],verbose=DISPLAY)

    # DISPLAY FOLD JACCARD
    fold_scores = []   # not named `all`, to avoid shadowing the builtin
    for k in idxV:
        start_sorted_indexes = np.argsort(-oof_start[k,], kind='quicksort', order=None)
        end_sorted_indexes   = np.argsort(-oof_end[k,], kind='quicksort', order=None)

        st = ""
        for start_rank, end_rank in rank_pairs:
            a = start_sorted_indexes[start_rank]
            b = end_sorted_indexes[end_rank]
            if a <= b:
                text1 = " "+" ".join(train_df.loc[k,'text'].split())
                enc = tokenizer.encode(text1)
                # Model position p maps to enc.ids[p-1] (the row starts with
                # an extra token). Clamp at 0 so a == 0 does not become the
                # negative slice start -1, which would wrap to the last token.
                st = tokenizer.decode(enc.ids[max(a-1, 0):b])
                break

        if not st.strip():
            # No ordered (start, end) pair, or an empty decoded span: fall
            # back to the whole tweet.
            st = train_df.loc[k,'text']

        fold_scores.append(jaccard(st,train_df.loc[k,'selected_text']))
    jac.append(np.mean(fold_scores))
    print('>>>> FOLD %i Jaccard ='%(fold+1),np.mean(fold_scores))
    print()

# Inference

Predicting test

In [None]:
# Average the start/end probabilities of all fold models on the test set.
DISPLAY = 1
test_inputs = [input_ids_t, attention_mask_t, token_type_ids_t]
n_rows_t = input_ids_t.shape[0]
preds_start = np.zeros((n_rows_t, MAX_LEN))
preds_end = np.zeros((n_rows_t, MAX_LEN))
banner = '#'*25

for i in range(n_splits):
    print(banner)
    print('### MODEL %i'%(i+1))
    print(banner)

    # Fresh graph per fold, then load that fold's saved weights.
    K.clear_session()
    model = build_model()
    model.load_weights('./v1-roberta-%i.h5'%(i))

    print('Predicting Test...')
    preds = model.predict(test_inputs, verbose=DISPLAY)
    # Each model contributes 1/n_splits of the final average.
    preds_start += preds[0]/n_splits
    preds_end += preds[1]/n_splits

In [None]:
# Decode the averaged start/end probabilities into one text span per test tweet.
# NOTE: `all` shadows the builtin; the name is kept because the next cell
# reads it.
all = []
counter = 0   # rows that fell back to the full tweet text
# (start rank, end rank) pairs tried in order: best/best, best/second,
# second/best, second/second.
rank_pairs = ((0, 0), (0, 1), (1, 0), (1, 1))
for k in range(input_ids_t.shape[0]):
    start_sorted_indexes = np.argsort(-preds_start[k,], kind='quicksort', order=None)
    end_sorted_indexes   = np.argsort(-preds_end[k,], kind='quicksort', order=None)

    st = ""
    for start_rank, end_rank in rank_pairs:
        a = start_sorted_indexes[start_rank]
        b = end_sorted_indexes[end_rank]
        if a <= b:
            text1 = " "+" ".join(test_df.loc[k,'text'].split())
            enc = tokenizer.encode(text1)
            # Model position p maps to enc.ids[p-1] (the row starts with an
            # extra token). Clamp at 0 so a == 0 does not become the negative
            # slice start -1, which would wrap to the last token.
            st = tokenizer.decode(enc.ids[max(a-1, 0):b])
            break

    if not st.strip():
        # No ordered (start, end) pair, or an empty decoded span: use the
        # whole tweet so downstream cells never see a blank prediction.
        counter += 1
        st = test_df.loc[k,'text']

    all.append(st)

print(counter, " row cant predicted on test set")

In [None]:
# Attach the decoded spans to the test frame and write the submission file.
# `all` is the list of predictions built in the previous cell.
test_df['selected_text'] = all
test_df[['textID','selected_text']].to_csv('submission.csv',index=False)

Predicting Train

In [None]:
# Average the start/end probabilities of all fold models on the training set.
DISPLAY = 1
train_inputs = [input_ids, attention_mask, token_type_ids]
n_rows_train = input_ids.shape[0]
train_preds_start = np.zeros((n_rows_train, MAX_LEN))
train_preds_end = np.zeros((n_rows_train, MAX_LEN))
banner = '#'*25

for i in range(n_splits):
    print(banner)
    print('### MODEL %i'%(i+1))
    print(banner)

    # Fresh graph per fold, then load that fold's saved weights.
    K.clear_session()
    model = build_model()
    model.load_weights('./v1-roberta-%i.h5'%(i))

    print('Predicting Train...')
    train_preds = model.predict(train_inputs, verbose=DISPLAY)
    # Each model contributes 1/n_splits of the final average.
    train_preds_start += train_preds[0]/n_splits
    train_preds_end += train_preds[1]/n_splits

In [None]:
# Decode the averaged start/end probabilities into one text span per training tweet.
# NOTE: `all` shadows the builtin; the name is kept because the next cell
# reads it.
all = []
counter = 0   # rows that fell back to the full tweet text
# (start rank, end rank) pairs tried in order: best/best, best/second,
# second/best, second/second.
rank_pairs = ((0, 0), (0, 1), (1, 0), (1, 1))
for k in range(input_ids.shape[0]):
    start_sorted_indexes = np.argsort(-train_preds_start[k,], kind='quicksort', order=None)
    end_sorted_indexes   = np.argsort(-train_preds_end[k,], kind='quicksort', order=None)

    st = ""
    for start_rank, end_rank in rank_pairs:
        a = start_sorted_indexes[start_rank]
        b = end_sorted_indexes[end_rank]
        if a <= b:
            text1 = " "+" ".join(train_df.loc[k,'text'].split())
            enc = tokenizer.encode(text1)
            # Model position p maps to enc.ids[p-1] (the row starts with an
            # extra token). Clamp at 0 so a == 0 does not become the negative
            # slice start -1, which would wrap to the last token.
            st = tokenizer.decode(enc.ids[max(a-1, 0):b])
            break

    if not st.strip():
        # No ordered (start, end) pair, or an empty decoded span: use the
        # whole tweet.
        counter += 1
        st = train_df.loc[k,'text']

    all.append(st)

print(counter, " row cant predicted on train set")

In [None]:
# Store the decoded spans (the list `all` from the previous cell) on the training frame.
train_df['predicted_text'] = all

Predicting Validation

In [None]:
# Average the start/end probabilities of all fold models on the validation set.
DISPLAY = 1
validation_inputs = [input_ids_v, attention_mask_v, token_type_ids_v]
n_rows_v = input_ids_v.shape[0]
validation_preds_start = np.zeros((n_rows_v, MAX_LEN))
validation_preds_end = np.zeros((n_rows_v, MAX_LEN))
banner = '#'*25

for i in range(n_splits):
    print(banner)
    print('### MODEL %i'%(i+1))
    print(banner)

    # Fresh graph per fold, then load that fold's saved weights.
    K.clear_session()
    model = build_model()
    model.load_weights('./v1-roberta-%i.h5'%(i))

    print('Predicting Validation...')
    validation_preds = model.predict(validation_inputs, verbose=DISPLAY)
    # Each model contributes 1/n_splits of the final average.
    validation_preds_start += validation_preds[0]/n_splits
    validation_preds_end += validation_preds[1]/n_splits

In [None]:
# Decode the averaged start/end probabilities into one text span per validation tweet.
# NOTE: `all` shadows the builtin; the name is kept because the next cell
# reads it.
all = []
counter = 0   # rows that fell back to the full tweet text
# (start rank, end rank) pairs tried in order: best/best, best/second,
# second/best, second/second.
rank_pairs = ((0, 0), (0, 1), (1, 0), (1, 1))
for k in range(input_ids_v.shape[0]):
    start_sorted_indexes = np.argsort(-validation_preds_start[k,], kind='quicksort', order=None)
    end_sorted_indexes   = np.argsort(-validation_preds_end[k,], kind='quicksort', order=None)

    st = ""
    for start_rank, end_rank in rank_pairs:
        a = start_sorted_indexes[start_rank]
        b = end_sorted_indexes[end_rank]
        if a <= b:
            text1 = " "+" ".join(validation_df.loc[k,'text'].split())
            enc = tokenizer.encode(text1)
            # Model position p maps to enc.ids[p-1] (the row starts with an
            # extra token). Clamp at 0 so a == 0 does not become the negative
            # slice start -1, which would wrap to the last token.
            st = tokenizer.decode(enc.ids[max(a-1, 0):b])
            break

    if not st.strip():
        # No ordered (start, end) pair, or an empty decoded span: use the
        # whole tweet so downstream cells never see a blank prediction.
        counter += 1
        st = validation_df.loc[k,'text']

    all.append(st)

print(counter, " row cant predicted on validation set")

In [None]:
# Store the decoded spans (the list `all` from the previous cell) on the validation frame.
validation_df['predicted_text'] = all

RoBERTa-only score on validation

Expected value is ~0.711

In [None]:
# Per-row Jaccard between the RoBERTa prediction and the ground-truth span.
cumulativeJaccard = [
    jaccard(predicted, truth)
    for predicted, truth in zip(validation_df["predicted_text"], validation_df["selected_text"])
]

print("Roberta Jaccard on validation = ", np.mean(cumulativeJaccard), " Expected value is ~0.711")

# Preparing Data For NER

In [None]:
# Inspect the training frame before building the word-level NER data.
train_df

In [None]:
# Inspect the validation frame before building the word-level NER data.
validation_df

In [None]:
# Inspect the test frame before building the word-level NER data.
test_df

Prepare Train

In [None]:
# Build the word-level training frame for the NER stage: two rows per tweet,
# one for the first word of the selected text and one for the last, each paired
# with the full tweet word that contains it.
text_se = []
selected_text_se = []
sentiment_se = []
position = []
for _, row in train_df.iterrows():
    tweet_words = row["text"].split()
    text = " " + " ".join(tweet_words)
    selected_words = row["selected_text"].split()
    selected_text = " ".join(selected_words)

    first_selected_word = selected_words[0]
    last_selected_word = selected_words[-1]

    # Character span of the selection inside the normalised tweet.
    span_start = text.find(selected_text)
    span_end = span_start + len(selected_text)

    # Full tweet words containing the span boundaries; these can be longer
    # than the (possibly truncated) selected words.
    full_start_word = None
    full_end_word = None

    # Single pass: `consumed` is the end offset of the current word in `text`.
    # The start word is always reached no later than the end word because the
    # selection is at least one character long.
    consumed = 0
    for word in tweet_words:
        consumed += len(word) + 1
        if full_start_word is None and span_start < consumed:
            full_start_word = word
        if span_end <= consumed:
            full_end_word = word
            break

    if row["sentiment"] == "neutral":
        labels = ("Start-Neutral", "End-Neutral")
    else:
        labels = ("Start", "End")

    # Start row first, then end row - the row order later cells rely on.
    position.extend(labels)
    text_se.extend((full_start_word, full_end_word))
    selected_text_se.extend((first_selected_word, last_selected_word))
    sentiment_se.extend((row["sentiment"], row["sentiment"]))

train_df_se = pd.DataFrame(data={"text_se":text_se, "selected_text_se":selected_text_se, "sentiment":sentiment_se, "position":position})

train_df_se

In [None]:
# Error Check: every selected word should be a substring of its full tweet word.
for index, selected_word, full_word in zip(train_df_se.index, train_df_se["selected_text_se"], train_df_se["text_se"]):
    if selected_word in full_word:
        continue
    print("Error at = " + str(index), selected_word, full_word,)

In [None]:
# Different words: list every row whose selected word differs from the full word.
counter = 0
train_word_rows = zip(train_df_se["sentiment"], train_df_se["position"],
                      train_df_se["text_se"], train_df_se["selected_text_se"])
for sentiment, where, full_word, selected_word in train_word_rows:
    if selected_word == full_word:
        continue
    counter += 1
    print("Sentiment     =  ",sentiment.upper() + "--" +where)
    print("Real word     =  ",full_word)
    print("Selected word =  ",selected_word)
    print("==========================================")
print("Different word number in train set =", counter)

Number of differing words in the train set: 3362, of which 346 are **neutral**.

Prepare Validation

In [None]:
# Build the word-level validation frame for the NER stage: two rows per tweet,
# one for the first word of the RoBERTa-predicted text and one for the last,
# each paired with the full tweet word that contains it.
text_se = []
selected_text_se = []
sentiment_se = []
position = []
for index, row in validation_df.iterrows():
    tweet_words = row["text"].split()
    text = " " + " ".join(tweet_words)

    # A blank prediction would make the `[0]` / `[-1]` lookups below raise
    # IndexError; treat it as "the whole tweet was selected" instead.
    # (A tweet with no words at all still raises, as before.)
    selected_words = row["predicted_text"].split() or tweet_words
    selected_text = " ".join(selected_words)

    selected_text_start_word = selected_words[0]
    selected_text_end_word = selected_words[-1]

    # Character span of the prediction inside the normalised tweet.
    # NOTE(review): if the prediction is not a substring of the tweet, find()
    # returns -1 and the words picked below are unreliable.
    selected_text_start_char = text.find(selected_text)
    selected_text_end_char = selected_text_start_char + len(selected_text)

    # These words are from text not selected_text. So they can be different from selected_text words
    selected_text_full_start_word = None
    selected_text_full_end_word = None

    # Single pass: `char_counter` is the end offset of the current word in
    # `text`. The start word is always reached no later than the end word
    # because the prediction is at least one character long.
    char_counter = 0
    for word in tweet_words:
        char_counter += len(word)+1

        if selected_text_full_start_word is None and selected_text_start_char < char_counter:
            # We find start word in text
            selected_text_full_start_word = word

        if selected_text_end_char <= char_counter:
            # We find end word in text
            selected_text_full_end_word = word
            break

    if row["sentiment"] == "neutral":
        labels = ("Start-Neutral", "End-Neutral")
    else:
        labels = ("Start", "End")

    # Start row first, then end row - later cells index rows as 2*i and 2*i+1.
    position.extend(labels)
    text_se.extend((selected_text_full_start_word, selected_text_full_end_word))
    selected_text_se.extend((selected_text_start_word, selected_text_end_word))
    sentiment_se.extend((row["sentiment"], row["sentiment"]))

validation_df_se = pd.DataFrame(data={"text_se":text_se, "selected_text_se":selected_text_se, "sentiment":sentiment_se, "position":position})

validation_df_se

In [None]:
# Spot-check word-level rows 96-98 (each tweet contributes two consecutive rows).
validation_df_se.loc[96:98]

In [None]:
# Tweets 48-49, the source rows for the word-level rows starting at 96 and 98.
validation_df.loc[48:49]

In [None]:
# Error Check: every predicted word should be a substring of its full tweet word.
for index, selected_word, full_word in zip(validation_df_se.index, validation_df_se["selected_text_se"], validation_df_se["text_se"]):
    if selected_word in full_word:
        continue
    print("Error at = " + str(index), selected_word, full_word,)

Prepare Test

In [None]:
# Build the word-level test frame for the NER stage: two rows per tweet, one
# for the first word of the RoBERTa-predicted text (stored in `selected_text`)
# and one for the last, each paired with the full tweet word that contains it.
text_se = []
selected_text_se = []
sentiment_se = []
position = []
for index, row in test_df.iterrows():
    tweet_words = row["text"].split()
    text = " " + " ".join(tweet_words)

    # A blank prediction would make the `[0]` / `[-1]` lookups below raise
    # IndexError; treat it as "the whole tweet was selected" instead.
    # (A tweet with no words at all still raises, as before.)
    selected_words = row["selected_text"].split() or tweet_words
    selected_text = " ".join(selected_words)

    selected_text_start_word = selected_words[0]
    selected_text_end_word = selected_words[-1]

    # Character span of the prediction inside the normalised tweet.
    # NOTE(review): if the prediction is not a substring of the tweet, find()
    # returns -1 and the words picked below are unreliable.
    selected_text_start_char = text.find(selected_text)
    selected_text_end_char = selected_text_start_char + len(selected_text)

    # These words are from text not selected_text. So they can be different from selected_text words
    selected_text_full_start_word = None
    selected_text_full_end_word = None

    # Single pass: `char_counter` is the end offset of the current word in
    # `text`. The start word is always reached no later than the end word
    # because the prediction is at least one character long.
    char_counter = 0
    for word in tweet_words:
        char_counter += len(word)+1

        if selected_text_full_start_word is None and selected_text_start_char < char_counter:
            # We find start word in text
            selected_text_full_start_word = word

        if selected_text_end_char <= char_counter:
            # We find end word in text
            selected_text_full_end_word = word
            break

    if row["sentiment"] == "neutral":
        labels = ("Start-Neutral", "End-Neutral")
    else:
        labels = ("Start", "End")

    # Start row first, then end row - later cells index rows as 2*i and 2*i+1.
    position.extend(labels)
    text_se.extend((selected_text_full_start_word, selected_text_full_end_word))
    selected_text_se.extend((selected_text_start_word, selected_text_end_word))
    sentiment_se.extend((row["sentiment"], row["sentiment"]))

test_df_se = pd.DataFrame(data={"text_se":text_se, "selected_text_se":selected_text_se, "sentiment":sentiment_se, "position":position})

test_df_se

In [None]:
# Error Check: every predicted word should be a substring of its full tweet word.
for index, selected_word, full_word in zip(test_df_se.index, test_df_se["selected_text_se"], test_df_se["text_se"]):
    if selected_word in full_word:
        continue
    print("Error at = " + str(index), selected_word, full_word,)

In [None]:
# Different words: list every row whose predicted word differs from the full word.
counter = 0
test_word_rows = zip(test_df_se["sentiment"], test_df_se["position"],
                     test_df_se["text_se"], test_df_se["selected_text_se"])
for sentiment, where, full_word, selected_word in test_word_rows:
    if selected_word == full_word:
        continue
    counter += 1
    print("Sentiment     =  ",sentiment.upper() + "--" +where)
    print("Real word     =  ",full_word)
    print("Selected word =  ",selected_word)
    print("==========================================")
print("Different word number in test set =", counter)

# NER

In [None]:
import re
from nltk.corpus import stopwords
import matplotlib.pyplot as plt
from tqdm import tqdm
import os
import nltk
import time
import spacy 
spacy.prefer_gpu()
import random
from spacy.util import compounding
from spacy.util import minibatch

In [None]:
def save_model(output_dir, nlp, new_model_name):
    """Save a spaCy pipeline under ``../working/<output_dir>``.

    Args:
        output_dir: directory name relative to ``../working``; when ``None``
            nothing is saved.
        nlp: pipeline object exposing a ``meta`` mapping and ``to_disk(path)``.
        new_model_name: value stored in ``nlp.meta["name"]`` before saving.
    """
    # Check for None before building the path: once formatted into the
    # f-string the value is always a string, so a later check could never skip.
    if output_dir is None:
        return

    output_dir = f'../working/{output_dir}'
    os.makedirs(output_dir, exist_ok=True)
    nlp.meta["name"] = new_model_name
    nlp.to_disk(output_dir)
    print("Saved model to", output_dir)

In [None]:
def train(train_data, output_dir, n_iter=20, cont=False):
    """Load the model, set up the pipeline and train the entity recognizer.

    Args:
        train_data: list of ``(text, {"entities": [[start, end, label], ...]})``
            examples. The list is copied, so the caller's order is preserved.
        output_dir: model directory, passed to ``save_model`` and checked for
            an existing model when ``cont`` is true.
        n_iter: number of passes over the training data.
        cont: when true and ``output_dir`` exists, resume from the model
            stored there instead of starting from a blank English pipeline.
    """
    # NOTE(review): `output_dir` is checked as given, while save_model writes
    # under '../working/<output_dir>'; the two only refer to the same place
    # when the working directory is '../working' - confirm.
    resume = cont and os.path.exists(output_dir)
    if resume:
        nlp = spacy.load(output_dir)  # load existing spaCy model
        print("Loaded model '%s'" % output_dir)
    else:
        nlp = spacy.blank("en")  # create blank Language class
        print("Created blank 'en' model")

    # create the built-in pipeline components and add them to the pipeline
    # nlp.create_pipe works for built-ins that are registered with spaCy
    if "ner" not in nlp.pipe_names:
        ner = nlp.create_pipe("ner")
        nlp.add_pipe(ner, last=True)
    # otherwise, get it so we can add labels
    else:
        ner = nlp.get_pipe("ner")

    # add labels
    for _, annotations in train_data:
        for ent in annotations.get("entities"):
            ner.add_label(ent[2])

    # Shuffle a private copy so the caller's list is not reordered in place.
    train_data = list(train_data)

    # get names of other pipes to disable them during training
    other_pipes = [pipe for pipe in nlp.pipe_names if pipe != "ner"]
    with nlp.disable_pipes(*other_pipes):  # only train NER
        if resume:
            nlp.resume_training()
        else:
            nlp.begin_training()

        for itn in tqdm(range(n_iter)):
            random.shuffle(train_data)
            # batch up the examples using spaCy's minibatch; batch size grows
            # from 4 towards 500 by a factor of 1.001 per batch
            batches = minibatch(train_data, size=compounding(4.0, 500.0, 1.001))
            losses = {}
            for batch in batches:
                texts, annotations = zip(*batch)
                nlp.update(texts,  # batch of texts
                            annotations,  # batch of annotations
                            drop=0.5,   # dropout - make it harder to memorise data
                            losses=losses,
                            )
            print("Losses", losses)
    save_model(output_dir, nlp, 'st_ner')

In [None]:
def get_model_out_path(position):
    '''
    Returns the model output path for a position label.

    'Start' and 'End' map to their model directories; any other value
    yields None.
    '''
    known_paths = (
        ('Start', 'models/model_Start'),
        ('End', 'models/model_End'),
    )
    for label, model_out_path in known_paths:
        if position == label:
            return model_out_path
    return None

In [None]:
def get_training_data(data, position):
    '''
    Returns training data in the format needed to train spaCy NER.

    Only rows whose ``position`` column equals ``position`` are used. Each
    example is ``(text_se, {"entities": [[start, end, 'selected_text']]})``
    where start/end locate ``selected_text_se`` inside ``text_se``.
    '''
    matching = data[data["position"] == position]
    examples = []
    for text, selected_text in zip(matching["text_se"], matching["selected_text_se"]):
        begin = text.find(selected_text)
        span = [begin, begin + len(selected_text), 'selected_text']
        examples.append((text, {"entities": [span]}))
    return examples

In [None]:
# Train the NER model that refines the first word of non-neutral predictions.
position = 'Start'

model_path = get_model_out_path(position)
train_data = get_training_data(train_df_se, position)

train(train_data, model_path, n_iter=3, cont=False)

In [None]:
# Train the NER model that refines the last word of non-neutral predictions.
position = 'End'

model_path = get_model_out_path(position)
train_data = get_training_data(train_df_se, position)

train(train_data, model_path, n_iter=3, cont=False)

In [None]:
def predict_entities(text, model):
    """Return the part of ``text`` covered by the model's first entity.

    Args:
        text: input string passed to ``model``.
        model: callable returning a document with an ``ents`` sequence whose
            items expose ``text``.

    Returns:
        The slice of ``text`` at the first occurrence of the first entity's
        text, or ``text`` unchanged when the model finds no entity.
    """
    first_entity = next(iter(model(text).ents), None)
    if first_entity is None:
        return text
    begin = text.find(first_entity.text)
    return text[begin:begin + len(first_entity.text)]

In [None]:
# Refine the boundary words of the validation predictions with the NER models.
selected_texts = []
MODELS_BASE_PATH = '../working/models/'

if MODELS_BASE_PATH is not None:
    print("Loading Models  from ", MODELS_BASE_PATH)
    model_Start = spacy.load(MODELS_BASE_PATH + 'model_Start')
    model_End = spacy.load(MODELS_BASE_PATH + 'model_End')

    ner_by_position = {'Start': model_Start, 'End': model_End}
    neutral_positions = ('Start-Neutral', 'End-Neutral')

    for text, where in zip(validation_df_se['text_se'], validation_df_se['position']):
        if where in neutral_positions:
            # Neutral tweets keep the full word untouched.
            selected_texts.append(text)
        elif where in ner_by_position:
            selected_texts.append(predict_entities(text, ner_by_position[where]))

validation_df_se['ner_predicted_text'] = selected_texts

In [None]:
# Inspect the word-level validation frame with the new `ner_predicted_text` column.
validation_df_se

In [None]:
# Different words: rows where the RoBERTa word differs from the full tweet word.
counter = sum(
    1
    for selected_word, full_word in zip(validation_df_se["selected_text_se"], validation_df_se["text_se"])
    if selected_word != full_word
)
print("Different word count in validation set =",counter)

In [None]:
# Rows where the RoBERTa word already equals the full tweet word.
counter = sum(
    1
    for full_word, selected_word in zip(validation_df_se["text_se"], validation_df_se["selected_text_se"])
    if full_word == selected_word
)
print("Same word count in validation set =",counter)

In [None]:
# Rows where the NER output differs from the full tweet word it was given.
counter = sum(
    1
    for ner_word, full_word in zip(validation_df_se["ner_predicted_text"], validation_df_se["text_se"])
    if ner_word != full_word
)
print("Changed word by NER in validation set =",counter)

In [None]:
# Rows where NER changed the word but did not land on the RoBERTa word either.
counter = 0
ner_rows = zip(validation_df_se["ner_predicted_text"], validation_df_se["text_se"],
               validation_df_se["selected_text_se"])
for ner_word, full_word, selected_word in ner_rows:
    if ner_word != full_word and ner_word != selected_word:
        counter += 1
print("Changed word by NER in validation set but unsuccesfull changing=",counter)

In [None]:
# Spot-check the End model on a single word with trailing punctuation.
predict_entities("badly??!?!??!!?", model_End)

In [None]:
# Show the word-level rows whose full word is the spot-check example.
for index, row in validation_df_se.iterrows():
    if row["text_se"] != "badly??!?!??!!?":
        continue
    print(row)

In [None]:
# Print each (full word, RoBERTa word) pair that differs.
for full_word, selected_word in zip(validation_df_se["text_se"], validation_df_se["selected_text_se"]):
    if selected_word == full_word:
        continue
    print(full_word, selected_word)

In [None]:
# Replace the first and last word of every validation prediction with the
# NER-refined words (word-level rows 2*i and 2*i+1 belong to tweet i).
refined_texts = []
for row_pos, (index, row) in enumerate(validation_df.iterrows()):

    predicted_words = row["predicted_text"].split()
    if not predicted_words:
        # Blank prediction: treat it as selecting the whole tweet, so there is
        # a first and a last word to replace.
        predicted_words = row["text"].split()

    if predicted_words:
        # Positional lookup: independent of the row labels of either frame.
        predicted_words[0] = validation_df_se["ner_predicted_text"].iloc[2*row_pos]
        predicted_words[-1] = validation_df_se["ner_predicted_text"].iloc[(2*row_pos)+1]

    refined_texts.append(" ".join(predicted_words))

# One whole-column assignment instead of chained `df[col][index] = ...`, which
# pandas does not guarantee to write through to the frame.
validation_df["predicted_text"] = refined_texts

In [None]:
# Inspect the validation frame after the NER refinement of `predicted_text`.
validation_df

In [None]:
# Mean Jaccard of the NER-refined predictions against the ground truth.
totalJaccard = 0
for predict, true in zip(validation_df["predicted_text"], validation_df["selected_text"]):
    totalJaccard += jaccard(predict, true)

meanJaccard = totalJaccard / len(validation_df['predicted_text'])
print("NER Validation results : ", meanJaccard)

In [None]:
# Inspect the test frame (RoBERTa predictions are in `selected_text`).
test_df

In [None]:
# Inspect the word-level test frame.
test_df_se

In [None]:
# Inspect the test frame again before applying the NER models.
test_df

In [None]:
# Refine the boundary words of the test predictions with the NER models.
selected_texts = []
MODELS_BASE_PATH = '../working/models/'

if MODELS_BASE_PATH is not None:
    print("Loading Models  from ", MODELS_BASE_PATH)
    model_Start = spacy.load(MODELS_BASE_PATH + 'model_Start')
    model_End = spacy.load(MODELS_BASE_PATH + 'model_End')

    ner_by_position = {'Start': model_Start, 'End': model_End}
    neutral_positions = ('Start-Neutral', 'End-Neutral')

    for text, where in zip(test_df_se['text_se'], test_df_se['position']):
        if where in neutral_positions:
            # Neutral tweets keep the full word untouched.
            selected_texts.append(text)
        elif where in ner_by_position:
            selected_texts.append(predict_entities(text, ner_by_position[where]))

test_df_se['ner_predicted_text'] = selected_texts

In [None]:
# Replace the first and last word of every test prediction with the
# NER-refined words (word-level rows 2*i and 2*i+1 belong to tweet i).
refined_texts = []
for row_pos, (index, row) in enumerate(test_df.iterrows()):

    predicted_words = row["selected_text"].split()
    if not predicted_words:
        # Blank prediction: treat it as selecting the whole tweet, so there is
        # a first and a last word to replace.
        predicted_words = row["text"].split()

    if predicted_words:
        # Positional lookup: independent of the row labels of either frame.
        predicted_words[0] = test_df_se["ner_predicted_text"].iloc[2*row_pos]
        predicted_words[-1] = test_df_se["ner_predicted_text"].iloc[(2*row_pos)+1]

    refined_texts.append(" ".join(predicted_words))

# One whole-column assignment instead of chained `df[col][index] = ...`, which
# pandas does not guarantee to write through to the frame.
test_df["selected_text"] = refined_texts

In [None]:
# Inspect the test frame after the NER refinement of `selected_text`.
test_df

In [None]:
# Write the final submission; this overwrites the RoBERTa-only submission.csv
# written earlier in the notebook.
test_df[['textID','selected_text']].to_csv('submission.csv',index=False)