In [None]:
%%capture
# Install the notebook's third-party dependencies; %%capture hides the installer output.
# NOTE(review): none of these installs pin a version, so a re-run may pull different releases.
!pip install simpletransformers
!pip install tokenizers
!pip install --upgrade transformers
# Clone NVIDIA apex and build it from source with its C++ and CUDA extensions enabled.
# NOTE(review): presumably required by the 'fp16' option passed to NERModel later — confirm.
!git clone https://github.com/NVIDIA/apex
!pip install -v --no-cache-dir --global-option="--cpp_ext" --global-option="--cuda_ext" ./apex
# Set the JOBLIB_TEMP_FOLDER environment variable to /tmp for this kernel.
%env JOBLIB_TEMP_FOLDER=/tmp

In [None]:
import re
import torch
import logging
import pandas as pd
from tokenizers import BertWordPieceTokenizer 
from tqdm.auto import tqdm
from simpletransformers.ner import NERModel

In [None]:
# Load the competition CSVs from the relative ../input directory.
# `test` is loaded here but is not referenced by any other visible cell.
train = pd.read_csv('../input/tweet-sentiment-extraction/train.csv')
test = pd.read_csv('../input/tweet-sentiment-extraction/test.csv')
# WordPiece tokenizer built from a local vocab file, configured with lowercase=True.
# NOTE(review): the model created later is 'bert-base-cased' while this tokenizer
# lowercases its input — confirm the vocab file and casing are the intended pairing.
tokenizer = BertWordPieceTokenizer("../input/berthub/assets/vocab.txt", lowercase=True)

In [None]:
def jaccard(str1, str2):
    """Return the word-level Jaccard similarity of two strings.

    Both strings are lower-cased and split on whitespace; the score is the
    size of the intersection of the two word sets divided by the size of
    their union.

    Args:
        str1: First string.
        str2: Second string.

    Returns:
        A float in [0.0, 1.0]. When neither string contains any word the
        two (empty) word sets are identical, so the score is defined as 1.0
        instead of raising ZeroDivisionError.
    """
    a = set(str1.lower().split())
    b = set(str2.lower().split())
    union_size = len(a | b)
    if union_size == 0:
        # Both inputs are empty or whitespace-only: nothing to divide by.
        return 1.0
    return float(len(a & b)) / union_size

In [None]:
def ner_tagger(sent_index,row_value,tokenizer):
    """Turn one tweet into tagged word pieces, in the order they occur in the tweet.

    The tweet is cut into "before selection", "selection" and "after selection"
    at the first occurrence of the selected text. Each part is tokenized on its
    own and the rows are emitted in that order, so the tagged sequence follows
    the original word order of the tweet.

    Args:
        sent_index: Identifier copied into every output row.
        row_value: Indexable holding (text, selected_text, sentiment). Only the
            first two entries are read; the sentiment entry is accepted for
            caller compatibility but is not used.
        tokenizer: Object whose ``encode(str)`` result exposes a ``tokens`` list.
            The first and last token of every encoding are dropped (presumably
            the special start/end markers the tokenizer adds).

    Returns:
        A list of ``[sent_index, token, tag]`` rows. The first token of the
        selection is tagged ``"B-SENT"``, its remaining tokens ``"I-SENT"`` and
        every other token ``"O"``. Later repeats of the selected text stay in
        the tweet and are tagged ``"O"``.
    """
    text = str(row_value[0])
    selected_text = str(row_value[1])

    def word_pieces(fragment):
        # Strip the first and last token of the encoding, as the original code did.
        return tokenizer.encode(fragment).tokens[1:-1]

    selected_tokens = word_pieces(selected_text)
    span_start = text.find(selected_text)

    if span_start < 0:
        # The selection is not a substring of the tweet, so there is no span to
        # locate. Keep the previous layout: selection rows, then the whole tweet.
        before_tokens = []
        after_tokens = word_pieces(text)
    else:
        span_end = span_start + len(selected_text)
        before_tokens = word_pieces(text[:span_start])
        after_tokens = word_pieces(text[span_end:])

    tagged_words = [[sent_index, token, "O"] for token in before_tokens]
    tagged_words.extend(
        [sent_index, token, "B-SENT" if position == 0 else "I-SENT"]
        for position, token in enumerate(selected_tokens)
    )
    tagged_words.extend([sent_index, token, "O"] for token in after_tokens)

    return tagged_words

In [None]:
# Start from an empty list so that re-running this cell does not append duplicates.
tagged_words = []

# Tag every training tweet. tqdm cannot infer the length of the iterrows()
# generator, so pass total explicitly to get a percentage and ETA rather than
# a bare iteration counter.
for index, row in tqdm(train.iterrows(), total=len(train)):
    tagged_words.extend(ner_tagger(index,row[["text","selected_text","sentiment"]].values,tokenizer))

In [None]:
# Preview only the first rows: the list holds one entry per token of every
# training tweet, so displaying it whole floods the notebook output.
tagged_words[:10]

In [None]:
# Flat token table: one row per tagged token, carrying its sentence id and label.
ner_columns = ["sentence_id", "words", "labels"]
dataset = pd.DataFrame(data=tagged_words, columns=ner_columns)

In [None]:
from sklearn.model_selection import train_test_split

# Split on sentence ids, not on individual token rows. A row-level split
# shuffles the rows and scatters the tokens of a single tweet across both
# sets, so no sentence survives intact in either of them. Note that the
# 80/20 ratio now applies to tweets rather than to tokens.
sentence_ids = dataset["sentence_id"].unique()
train_ids, eval_ids = train_test_split(sentence_ids, train_size =0.8, random_state = 42)

# Boolean-mask selection keeps the rows in their original order, so the tokens
# of each sentence stay in sequence.
train_df = dataset[dataset["sentence_id"].isin(train_ids)]
eval_df = dataset[dataset["sentence_id"].isin(eval_ids)]

# Every sentence must land in exactly one split.
assert set(train_df["sentence_id"]).isdisjoint(eval_df["sentence_id"])
assert len(train_df) + len(eval_df) == len(dataset)

In [None]:
# Histogram of the tag column for each split.
# NOTE(review): both calls appear to draw onto the current axes, so the two
# histograms overlap in one figure — confirm that is the intended view.
train_df["labels"].hist()
eval_df["labels"].hist()

In [None]:
# Display the raw training frame as the cell output for a visual check.
train

In [None]:
%env JOBLIB_TEMP_FOLDER=/tmp
logging.basicConfig(level=logging.INFO)
transformers_logger = logging.getLogger("transformers")
transformers_logger.setLevel(logging.WARNING)

NER_labels = ["O", "B-SENT", "I-SENT"]

if torch.cuda.is_available():
    model = NERModel('bert', 'bert-base-cased', labels = NER_labels, args={'overwrite_output_dir': True, 'reprocess_input_data': True, 'fp16': True})
else:
    model = NERModel('bert', 'bert-base-cased', labels = NER_labels, use_cuda = False, args={'overwrite_output_dir': True, 'reprocess_input_data': True, 'fp16': False})

# Train the model
model.train_model(train_df)

# Evaluate the model
#result, model_outputs, predictions = model.eval_model(eval_df)

# Predictions on arbitary text strings
#predictions, raw_outputs = model.predict(["Some arbitary sentence"])

#print(predictions)

In [None]:
# Re-apply the JOBLIB_TEMP_FOLDER setting, then evaluate the trained model on
# the held-out split. eval_model returns three values, bound here as
# result, model_outputs and predictions.
%env JOBLIB_TEMP_FOLDER=/tmp
result, model_outputs, predictions = model.eval_model(eval_df)

In [None]:
# Display the first value returned by model.eval_model(eval_df).
result

In [None]:
# NOTE(review): this repeats the previous cell's display of `result`; one of
# the two cells can likely be removed.
result