In [None]:
import pickle
import pandas as pd
import numpy as np

from sklearn.model_selection import StratifiedKFold
import tokenizers
import math
import re
from numba import jit
from tqdm import tqdm


# Number of slots per row in the per-example arrays allocated later on.
MAX_LEN = 120
# Folder that holds the byte-level BPE vocabulary and merges files.
PATH = '../input/tf-roberta/'

tokenizer = tokenizers.ByteLevelBPETokenizer(
    vocab_file='{}vocab-roberta-base.json'.format(PATH),
    merges_file='{}merges-roberta-base.txt'.format(PATH),
    add_prefix_space=True,
    lowercase=True,
)

# Integer id written into the model input for each sentiment label.
# NOTE(review): presumably the ids of these words in the vocabulary above - confirm.
sentiment_id = dict(positive=1809, negative=3392, neutral=14058)

def jaccard(str1, str2):
    """Return the word-level Jaccard similarity of two strings.

    Both strings are lower-cased and split on whitespace into sets of
    words.  If both sets are empty the score is 0.5; otherwise it is the
    size of the intersection divided by the size of the union.
    """
    words1 = set(str1.lower().split())
    words2 = set(str2.lower().split())
    if not words1 and not words2:
        return 0.5
    shared = words1 & words2
    union_size = len(words1) + len(words2) - len(shared)
    return float(len(shared)) / union_size

# preprocess

In [None]:
# Load the training data; missing values are replaced by empty strings.
train = pd.read_csv('../input/tweet-sentiment-extraction/train.csv').fillna('')

text = train['text'].values
# Copied: the preprocessing loop further down assigns into selected_text[k].
selected_text = train['selected_text'].values.copy()
sentiments = train['sentiment'].values

ct = train.shape[0]
# One row per training example, MAX_LEN slots per row.
# input_ids starts out filled with 1, every other array with 0.
input_ids = np.ones((ct,MAX_LEN),dtype='int32')
attention_mask = np.zeros((ct,MAX_LEN),dtype='int32')
token_type_ids = np.zeros((ct,MAX_LEN),dtype='int32')
start_tokens = np.zeros((ct,MAX_LEN),dtype='int32')
end_tokens = np.zeros((ct,MAX_LEN),dtype='int32')

# Disabled variant of the preprocessing loop that follows, kept as a bare
# string expression so that it never executes.  Its logic differs from the
# active loop only in how `ee` is corrected when the text starts with a space
# (the lines carrying the "guchio" marker).
"""
for k in tqdm(range(train.shape[0])):
    ss = text[k].find(selected_text[k])
    if text[k][max(ss - 2, 0):ss] == '  ':
        ss -= 2
    if ss > 0  and text[k][ss - 1] == ' ':
        ss -= 1

    ee = ss + len(selected_text[k])

    #if re.match(r' [^ ]', text[k]) is not None:
    if len(text[k]) > 0 and text[k][0] == ' ':  # 変更箇所 guchio
        #ee -= 1
        front_spaces = re.findall("^ +[^ ]", text[k]) # 変更箇所 guchio
        ee -= front_spaces[0].count(' ') # 変更箇所 guchio
        for cnt_base in re.findall("[^ ]  +[^ ]", text[k][:ee].strip()): # 変更箇所 guchio
            ee -= cnt_base[2:].count(' ') # 変更箇所 guchio
    ss = max(0, ss)
    if '  ' in text[k][:ss] and sentiments[k] != 'neutral':
        text1 = " ".join(text[k].split())
        sel = text1[ss:ee].strip()
        if len(sel) > 1 and sel[-2] == ' ':
            sel = sel[:-2]

        selected_text[k] = sel
        
    text1 = " "+" ".join(text[k].split())
    text2 = " ".join(selected_text[k].split()).lstrip(".,;:")

    idx = text1.find(text2)
    if idx != -1:
        chars = np.zeros((len(text1)))
        chars[idx:idx+len(text2)]=1
        if text1[idx-1]==' ': chars[idx-1] = 1 
    else:
        import pdb;pdb.set_trace()
        chars = np.ones((len(text1)))
    enc = tokenizer.encode(text1) 

    # ID_OFFSETS
    offsets = enc.offsets

    # START END TOKENS
    _toks = []

    for i,(a,b) in enumerate(offsets):
        sm = np.mean(chars[a:b])
        # if (sm > 0.6 and chars[a] != 0):  # こうすると若干伸びるけど...
        if (sm > 0.5 and chars[a] != 0): 
            _toks.append(i) 

    toks = _toks
    s_tok = sentiment_id[sentiments[k]]
    input_ids[k,:len(enc.ids)+3] = [0, s_tok] + enc.ids + [2]
    attention_mask[k,:len(enc.ids)+3] = 1
    if len(toks)>0:
        start_tokens[k,toks[0]+2] = 1
        end_tokens[k,toks[-1]+2] = 1
"""

# Build the model inputs and the start/end token labels for every training row.
for k in tqdm(range(train.shape[0])):
    # Position of the label in the raw text (-1 when absent).  The start is
    # moved left past up to three spaces that directly precede it; `ee` is
    # then measured from the moved start.
    ss = text[k].find(selected_text[k])
    if text[k][max(ss - 2, 0):ss] == '  ':
        ss -= 2
    if ss > 0 and text[k][ss - 1] == ' ':
        ss -= 1

    ee = ss + len(selected_text[k])

    # Raw text that begins with exactly one space: pull the end back by one.
    if re.match(r' [^ ]', text[k]) is not None:
        ee -= 1

    ss = max(0, ss)
    # Non-neutral rows with a double space before the span: re-derive the label
    # by applying the raw-text offsets to the whitespace-normalised text.
    if '  ' in text[k][:ss] and sentiments[k] != 'neutral':
        text1 = " ".join(text[k].split())
        sel = text1[ss:ee].strip()
        # Drop a trailing "<space><one character>".
        if len(sel) > 1 and sel[-2] == ' ':
            sel = sel[:-2]

        selected_text[k] = sel

    text1 = " " + " ".join(text[k].split())
    text2 = " ".join(selected_text[k].split()).lstrip(".,;:")

    # Per-character mask over text1: 1 for characters that belong to the label.
    idx = text1.find(text2)
    if idx != -1:
        chars = np.zeros((len(text1)))
        chars[idx:idx + len(text2)] = 1
        # Also mark the character just before the label when it is a space.
        if text1[idx - 1] == ' ':
            chars[idx - 1] = 1
    else:
        # The label cannot be located in the normalised text.  Report the row
        # (without stopping the run in a debugger) and mark every character.
        tqdm.write("row %d: selected_text not found in normalised text; "
                   "marking the whole text" % k)
        chars = np.ones((len(text1)))
    enc = tokenizer.encode(text1)

    # Character offsets of each token in text1.
    offsets = enc.offsets

    # A token belongs to the label when more than half of its characters are
    # marked and its first character is marked.
    # (Original note: a 0.6 threshold gave a slightly better score, but ... -
    # the note was left unfinished.)
    toks = []
    for i, (a, b) in enumerate(offsets):
        sm = np.mean(chars[a:b])
        if (sm > 0.5 and chars[a] != 0):
            toks.append(i)

    # Input layout: [0, sentiment id] + text token ids + [2]; label positions
    # are therefore shifted by the two leading entries.
    s_tok = sentiment_id[sentiments[k]]
    input_ids[k, :len(enc.ids) + 3] = [0, s_tok] + enc.ids + [2]
    attention_mask[k, :len(enc.ids) + 3] = 1
    if len(toks) > 0:
        start_tokens[k, toks[0] + 2] = 1
        end_tokens[k, toks[-1] + 2] = 1


# Postprocess functions

In [None]:

import re
def modify_punc_length(text, selected_text):
    """Adjust the run of '!', '.' or '?' that ends ``selected_text``.

    * No trailing run, or a run of 2-3 characters: returned unchanged.
    * Run of 4 or more characters: shortened to its first two characters.
    * Run of exactly 1 character: ``selected_text`` is looked up in ``text``
      (each single space in it may match several spaces in ``text``).  When
      the character right after the match is '!', '.' or '?', the matched
      span plus that one character is returned, taken verbatim from
      ``text``.  When ``selected_text`` cannot be located in ``text`` it is
      returned unchanged.

    Parameters
    ----------
    text : str
        Text in which ``selected_text`` is searched.
    selected_text : str
        Selection whose trailing punctuation is adjusted.

    Returns
    -------
    str
    """
    m = re.search(r'[!\.\?]+$', selected_text)
    if m is None:
        return selected_text

    conti_punc = len(m.group())

    if conti_punc >= 4:
        selected_text = selected_text[:-(conti_punc-2)]
    elif conti_punc == 1:
        # Look the selection up in the text: escape regex metacharacters and
        # let every single space match one or more spaces.
        tmp = re.sub(r"([\\\*\+\.\?\{\}\(\)\[\]\^\$\|])", r"\\\g<0>", selected_text)
        pat = re.sub(r" ", " +", tmp)
        found = re.search(pat, text)
        if found is None:
            # Nothing to compare against; leave the selection as it is
            # instead of failing on a missing match object.
            return selected_text
        f_idx0 = found.start()
        f_idx1 = found.end()

        if f_idx1 != len(text) and text[f_idx1] in ("!", ".", "?"):
            f_idx1 += 1
            selected_text = text[f_idx0:f_idx1]
    return selected_text


import math
def postprocess(row):
    """Map a span predicted on the normalised text back onto the original text.

    Parameters
    ----------
    row : object with the attributes
        original_text   -- the raw text,
        normalized_text -- the text the span offsets refer to,
        sentiment       -- sentiment label; 'neutral' is handled separately,
        y_start_char, y_end_char -- span bounds into ``normalized_text``
                                    (end exclusive).

    Returns
    -------
    str
        The selected text after the heuristics below.  If the original text
        is empty once tabs are removed, the stripped normalised text is
        returned.

    Raises
    ------
    ValueError
        If the predicted span cannot be located in the original text.
    """
    # Tabs are removed from the original text before any matching.
    original_text = row.original_text.replace('\t', '')
    if original_text == '':
        return row.normalized_text.strip()
    y_start_char = row.y_start_char
    y_end_char = row.y_end_char
    raw_span = row.normalized_text[y_start_char:y_end_char].strip()
    y_selected_text = raw_span
    # Non-neutral span that ends in '.' and is either followed by another '.'
    # or already ends in '..': normalise the trailing dots to exactly '..'.
    # endswith()/slicing keep an empty or one-character span from raising.
    if (y_end_char < len(row.normalized_text) and row.sentiment != 'neutral' and
        y_selected_text.endswith('.') and
        (row.normalized_text[y_end_char] == '.' or
         y_selected_text[-2:-1] == '.')):
        y_selected_text = re.sub(r'\.+$', '..', y_selected_text)

    # Locate the span in the original text: escape regex metacharacters and
    # let every single space match one or more spaces.
    tmp = re.sub(r"([\\\*\+\.\?\{\}\(\)\[\]\^\$\|])", r"\\\g<0>", y_selected_text)
    pat = re.sub(r" ", " +", tmp)
    m = re.search(pat, original_text)
    if m is None:
        raise ValueError(
            "postprocess: span %r (raw span %r, normalized text %r) "
            "not found in the original text"
            % (y_selected_text, raw_span, row.normalized_text))
    ss2 = m.start()
    ee2 = m.end()

    # 'neutral' rows, and rows where almost the whole text is selected.
    if row.sentiment == 'neutral' or ((ee2 - ss2) / len(original_text) > 0.75 and  (ee2 - ss2) > 9):
        # A span starting within the first five characters is snapped to the
        # start of the text, unless the text begins with '_'.
        if len(original_text) > 0 and original_text[0] != '_' and ss2 < 5:
            ss2 = 0
        # Take one more character when the span is followed by a doubled
        # punctuation mark or bracket.
        if (ee2 < len(original_text)-1 and original_text[ee2:ee2+2] in ('..', '!!', '??', '((', '))')):
            ee2 += 1
        st = original_text[ss2:ee2].lstrip(' ½¿')
        y_selected_text = st

    else:
        if original_text[:int((ss2+ee2) * 0.5) + 1].count('  ') > 0:
            # A double space occurs before the middle of the span: apply the
            # normalised-text offsets directly to the original text.
            ss = y_start_char
            ee = y_end_char + 1
            # Sliced lookahead so that a span starting on the last character
            # cannot index past the end of the text.
            if ss > 1 and original_text[ss-1:ss+1] == '..' and original_text[ss+1:ss+2] != '.':
                ss -= 1
            st = original_text[ss:ee]
            # Drop a trailing "<space><one character>".
            y_selected_text = re.sub(r' .$', '', st)
        else:
            if (ee2 < len(original_text)-1 and original_text[ee2:ee2+2] in ('..', '!!', '??', '((', '))')):
                ee2 += 1
            # Step the start back by one when the text has a leading space.
            if  original_text[0] == ' ':
                ss2 -= 1

            y_selected_text = original_text[ss2:ee2].strip(' ½')

            # For simplicity this is limited to the case where the two
            # prefixes compared here are identical.
            if row.normalized_text[:y_end_char + 5] == " " + row.original_text[:ee2 + 4]:
                y_selected_text = modify_punc_length(original_text, y_selected_text)


    return y_selected_text

# Inference

In [None]:
# Read the training data again (missing values become empty strings) so the
# arrays below are taken fresh from the file.
train = pd.read_csv('../input/tweet-sentiment-extraction/train.csv').fillna('')

ids = train['textID'].values
text = train['text'].values
sentiments = train['sentiment'].values
selected_text = train['selected_text'].values.copy()

def jaccard(str1, str2):
    """Return the word-level Jaccard similarity of two strings.

    Same definition as in the first cell of this notebook: both strings are
    lower-cased and split on whitespace into sets of words.  If both sets are
    empty the score is 0.5; otherwise it is the size of the intersection
    divided by the size of the union.
    """
    words1 = set(str1.lower().split())
    words2 = set(str2.lower().split())
    if not words1 and not words2:
        return 0.5
    shared = words1 & words2
    union_size = len(words1) + len(words2) - len(shared)
    return float(len(shared)) / union_size


# Score accumulators: overall, per-row outputs, and one list per sentiment.
all = []
list_st = []
all_nn = []
all_p = []
all_n = []

from collections import namedtuple
Row = namedtuple('Row', ['original_text', 'normalized_text', 'sentiment', 'y_start_char', 'y_end_char'])
# Decode the start/end token labels back to text, post-process the result and
# score it against the selected text read from the file.
for k in tqdm(list(range(train.shape[0]))):
    text0 = text[k]
    text1 = " " + " ".join(text[k].split())
    enc = tokenizer.encode(text1)

    # Positions of the label tokens; the labels were written two positions to
    # the right of the token index, hence the "- 2" below.
    aa = np.argmax(start_tokens[k])
    bb = np.argmax(end_tokens[k])

    ss = enc.offsets[aa - 2][0]
    ee = enc.offsets[bb - 2][1]
    st = text1[ss:ee].strip()

    row = Row(
        original_text=text0,
        normalized_text=text1,
        sentiment=sentiments[k],
        y_start_char=ss,
        y_end_char=ee,
    )
    try:
        st = postprocess(row)
    except Exception:
        # Report which row failed, then re-raise with the original traceback.
        print(k)
        raise

    list_st.append(st)
    sc = jaccard(st,selected_text[k])
    if sentiments[k] == 'neutral':
        all_nn.append(sc)
    elif sentiments[k] == 'positive':
        all_p.append(sc)
    else:
        all_n.append(sc)

    all.append(sc)
print('>>>> FOLD Jaccard all =',np.mean(all))
print('>>>> FOLD Jaccard neutral =',np.mean(all_nn))
print('>>>> FOLD Jaccard positive =',np.mean(all_p))
print('>>>> FOLD Jaccard negative =',np.mean(all_n))

* >>>> FOLD Jaccard all = 0.9798332533206786
* >>>> FOLD Jaccard neutral = 0.9961648726550028
* >>>> FOLD Jaccard positive = 0.9695626620283051
* >>>> FOLD Jaccard negative = 0.9678254485028062

* >>>> FOLD Jaccard all = 0.9796338924844558
* >>>> FOLD Jaccard neutral = 0.996154667866922
* >>>> FOLD Jaccard positive = 0.969079424824062
* >>>> FOLD Jaccard negative = 0.967668908646805

# CV inference

In [None]:
# Read the training data again (missing values become empty strings) so the
# arrays below are taken fresh from the file.
train = pd.read_csv('../input/tweet-sentiment-extraction/train.csv').fillna('')

text = train['text'].values
sentiments = train['sentiment'].values
selected_text = train['selected_text'].values.copy()

def jaccard(str1, str2):
    """Return the word-level Jaccard similarity of two strings.

    Same definition as in the first cell of this notebook: both strings are
    lower-cased and split on whitespace into sets of words.  If both sets are
    empty the score is 0.5; otherwise it is the size of the intersection
    divided by the size of the union.
    """
    words1 = set(str1.lower().split())
    words2 = set(str2.lower().split())
    if not words1 and not words2:
        return 0.5
    shared = words1 & words2
    union_size = len(words1) + len(words2) - len(shared)
    return float(len(shared)) / union_size


# Score accumulators: overall, per-row outputs, and one list per sentiment.
all = []
list_st = []
all_nn = []
all_p = []
all_n = []

DIR = '../input/tweet-cv-ens-0609/'

# Saved per-row arrays, indexed by row and then by character position below.
# NOTE(review): presumably out-of-fold character-level start/end probabilities
# - confirm against the notebook that wrote these files.
oof_chr_start = np.load(DIR + 'oof_chr_start.npy')
oof_chr_end = np.load(DIR + 'oof_chr_end.npy')


from collections import namedtuple
Row = namedtuple('Row', ['original_text', 'normalized_text', 'sentiment', 'y_start_char', 'y_end_char'])
for k in tqdm(list(range(train.shape[0]))):
    text0 = text[k]
    text1 = " " + " ".join(text[k].split())

    start_prob = oof_chr_start[k]
    # A tiny, increasing offset is added to the end scores; among otherwise
    # equal values argmax then picks the later position.
    # NOTE(review): 141 has to be broadcast-compatible with oof_chr_end[k],
    # presumably its length - confirm.
    end_prob = oof_chr_end[k] + np.arange(141) * 1.0e-15

    y_start_char = start_prob.argmax()
    # The end may not come before the start.
    end_prob[:y_start_char] = 0
    y_end_char = end_prob.argmax() + 1


    ss = y_start_char
    ee = y_end_char
    # Raw span; kept as the result when post-processing fails below.
    st = text1[ss:ee].strip()


    row = Row(
        original_text=text0,
        normalized_text=text1,
        sentiment=sentiments[k],
        y_start_char=ss,
        y_end_char=ee,
    )
    try:
        st = postprocess(row)
    except Exception as exc:
        # Best effort: keep the raw span, but say which row failed and why.
        # Catching Exception (not everything) keeps Ctrl-C working.
        print(k, repr(exc))

    list_st.append(st)
    sc = jaccard(st,selected_text[k])
    if sentiments[k] == 'neutral':
        all_nn.append(sc)
    elif sentiments[k] == 'positive':
        all_p.append(sc)
    else:
        all_n.append(sc)

    all.append(sc)
print('>>>> FOLD Jaccard all =',np.mean(all))
print('>>>> FOLD Jaccard neutral =',np.mean(all_nn))
print('>>>> FOLD Jaccard positive =',np.mean(all_p))
print('>>>> FOLD Jaccard negative =',np.mean(all_n))