In [1]:
import numpy as np
import pandas as pd
import os
import tokenizers
import string
import torch
import transformers
import torch.nn as nn
from torch.nn import functional as F
from tqdm import tqdm
import re

In [2]:
import random
def seed_everything(seed):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = True

seed_everything(323)

In [3]:
MAX_LEN = 128
TRAIN_BATCH_SIZE = 32
VALID_BATCH_SIZE = 8
EPOCHS = 5
ROBERTA_PATH = "../input/roberta-base"
TOKENIZER = tokenizers.ByteLevelBPETokenizer(
    vocab_file=f"{ROBERTA_PATH}/vocab.json", 
    merges_file=f"{ROBERTA_PATH}/merges.txt", 
    lowercase=True,
    add_prefix_space=True
)

In [4]:
class TweetModel(transformers.BertPreTrainedModel):
    def __init__(self, conf):
        super(TweetModel, self).__init__(conf)
        self.roberta = transformers.RobertaModel.from_pretrained(ROBERTA_PATH, config=conf)
        self.drop_out = nn.Dropout(0.3)
        self.l0 = nn.Linear(768 * 2, 2)
        torch.nn.init.normal_(self.l0.weight, std=0.02)
    
    def forward(self, ids, mask, token_type_ids):
        _, _, out = self.roberta(
            ids,
            attention_mask=mask,
            token_type_ids=token_type_ids
        )

        out = torch.cat((out[-1], out[-2]), dim=-1)
        logits1 = self.l0(self.drop_out(out))
        logits2 = self.l0(self.drop_out(out))
        logits3 = self.l0(self.drop_out(out))
        logits4 = self.l0(self.drop_out(out))
        logits5 = self.l0(self.drop_out(out))
        logits = torch.mean(torch.stack([logits1, logits2, logits3, logits4, logits5]), dim = 0)
        start_logits, end_logits = logits.split(1, dim=-1)

        start_logits = start_logits.squeeze(-1)
        end_logits = end_logits.squeeze(-1)

        return start_logits, end_logits

In [5]:
def process_data(tweet, selected_text, sentiment, tokenizer, max_len):
    tweet = " " + " ".join(str(tweet).split())
    selected_text = " " + " ".join(str(selected_text).split())

    len_st = len(selected_text) - 1
    idx0 = None
    idx1 = None

    for ind in (i for i, e in enumerate(tweet) if e == selected_text[1]):
        if " " + tweet[ind: ind+len_st] == selected_text:
            idx0 = ind
            idx1 = ind + len_st - 1
            break

    char_targets = [0] * len(tweet)
    if idx0 != None and idx1 != None:
        for ct in range(idx0, idx1 + 1):
            char_targets[ct] = 1
    
    tok_tweet = tokenizer.encode(tweet)
    input_ids_orig = tok_tweet.ids
    tweet_offsets = tok_tweet.offsets
    
    target_idx = []
    for j, (offset1, offset2) in enumerate(tweet_offsets):
        if sum(char_targets[offset1: offset2]) > 0:
            target_idx.append(j)
    
    targets_start = target_idx[0]
    targets_end = target_idx[-1]

    sentiment_id = {
        'positive': 1313,
        'negative': 2430,
        'neutral': 7974
    }
    
    input_ids = [0] + [sentiment_id[sentiment]] + [2] + [2] + input_ids_orig + [2]
    token_type_ids = [0, 0, 0, 0] + [0] * (len(input_ids_orig) + 1)
    mask = [1] * len(token_type_ids)
    tweet_offsets = [(0, 0)] * 4 + tweet_offsets + [(0, 0)]
    targets_start += 4
    targets_end += 4

    padding_length = max_len - len(input_ids)
    if padding_length > 0:
        input_ids = input_ids + ([1] * padding_length)
        mask = mask + ([0] * padding_length)
        token_type_ids = token_type_ids + ([0] * padding_length)
        tweet_offsets = tweet_offsets + ([(0, 0)] * padding_length)
    
    return {
        'ids': input_ids,
        'mask': mask,
        'token_type_ids': token_type_ids,
        'targets_start': targets_start,
        'targets_end': targets_end,
        'orig_tweet': tweet,
        'orig_selected': selected_text,
        'sentiment': sentiment,
        'offsets': tweet_offsets
    }

class TweetDataset:
    def __init__(self, tweet, sentiment, selected_text):
        self.tweet = tweet
        self.sentiment = sentiment
        self.selected_text = selected_text
        self.tokenizer = TOKENIZER
        self.max_len = MAX_LEN
    
    def __len__(self):
        return len(self.tweet)

    def __getitem__(self, item):
        data = process_data(
            self.tweet[item], 
            self.selected_text[item], 
            self.sentiment[item],
            self.tokenizer,
            self.max_len
        )

        return {
            'ids': torch.tensor(data["ids"], dtype=torch.long),
            'mask': torch.tensor(data["mask"], dtype=torch.long),
            'token_type_ids': torch.tensor(data["token_type_ids"], dtype=torch.long),
            'targets_start': torch.tensor(data["targets_start"], dtype=torch.long),
            'targets_end': torch.tensor(data["targets_end"], dtype=torch.long),
            'orig_tweet': data["orig_tweet"],
            'orig_selected': data["orig_selected"],
            'sentiment': data["sentiment"],
            'offsets': torch.tensor(data["offsets"], dtype=torch.long)
        }

In [6]:
def calculate_jaccard_score(
    original_tweet, 
    target_string, 
    sentiment_val, 
    idx_start, 
    idx_end, 
    offsets,
    verbose=False):
    
    
    
    filtered_output  = ""
    
    if idx_end < idx_start:
        filtered_output = original_tweet
        jac = 0
        return jac, filtered_output
        
    for ix in range(idx_start, idx_end + 1):
        filtered_output += original_tweet[offsets[ix][0]: offsets[ix][1]]
        if (ix+1) < len(offsets) and offsets[ix][1] < offsets[ix+1][0]:
            filtered_output += " "

#     if sentiment_val == "neutral" or len(original_tweet.split()) < 3:
#         filtered_output = original_tweet

   # if sentiment_val != "neutral" and verbose == True:
    if verbose == True:
        if filtered_output.strip().lower() != target_string.strip().lower():
            print("********************************")
            print(f"Output= {filtered_output.strip()}")
            print(f"Target= {target_string.strip()}")
            print(f"Tweet= {original_tweet.strip()}")
            print("********************************")

    jac = 0
    return jac, filtered_output

In [7]:
df_test = pd.read_csv("../input/tweet-sentiment-extraction/test.csv")
df_test.loc[:, "selected_text"] = df_test.text.values

In [8]:
device = torch.device("cuda")
model_config = transformers.RobertaConfig.from_pretrained(ROBERTA_PATH)
model_config.output_hidden_states = True

In [9]:
#tweet-version5-model  tweet-roberta-train-two
model1 = TweetModel(conf=model_config)
model1.to(device)
model1.load_state_dict(torch.load("../input/tweet-roberta-train-two/model_0.bin"))
model1.eval()

model2 = TweetModel(conf=model_config)
model2.to(device)
model2.load_state_dict(torch.load("../input/tweet-roberta-train-two/model_1.bin"))
model2.eval()

model3 = TweetModel(conf=model_config)
model3.to(device)
model3.load_state_dict(torch.load("../input/tweet-roberta-train-two/model_2.bin"))
model3.eval()

model4 = TweetModel(conf=model_config)
model4.to(device)
model4.load_state_dict(torch.load("../input/tweet-roberta-train-two/model_3.bin"))
model4.eval()

model5 = TweetModel(conf=model_config)
model5.to(device)
model5.load_state_dict(torch.load("../input/tweet-roberta-train-two/model_4.bin"))
model5.eval()

TweetModel(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-05, el

In [10]:
final_output = []

In [11]:
test_dataset = TweetDataset(
        tweet=df_test.text.values,
        sentiment=df_test.sentiment.values,
        selected_text=df_test.selected_text.values
    )

data_loader = torch.utils.data.DataLoader(
    test_dataset,
    shuffle=False,
    batch_size=VALID_BATCH_SIZE,
    num_workers=1
)


with torch.no_grad():
    tk0 = tqdm(data_loader, total=len(data_loader))
    for bi, d in enumerate(tk0):
        ids = d["ids"]
        token_type_ids = d["token_type_ids"]
        mask = d["mask"]
        sentiment = d["sentiment"]
        orig_selected = d["orig_selected"]
        orig_tweet = d["orig_tweet"]
        targets_start = d["targets_start"]
        targets_end = d["targets_end"]
        offsets = d["offsets"].numpy()

        ids = ids.to(device, dtype=torch.long)
        token_type_ids = token_type_ids.to(device, dtype=torch.long)
        mask = mask.to(device, dtype=torch.long)
        targets_start = targets_start.to(device, dtype=torch.long)
        targets_end = targets_end.to(device, dtype=torch.long)

        outputs_start1, outputs_end1 = model1(
            ids=ids,
            mask=mask,
            token_type_ids=token_type_ids
        )
        
        outputs_start2, outputs_end2 = model2(
            ids=ids,
            mask=mask,
            token_type_ids=token_type_ids
        )
        
        outputs_start3, outputs_end3 = model3(
            ids=ids,
            mask=mask,
            token_type_ids=token_type_ids
        )
        
        outputs_start4, outputs_end4 = model4(
            ids=ids,
            mask=mask,
            token_type_ids=token_type_ids
        )
        
        outputs_start5, outputs_end5 = model5(
            ids=ids,
            mask=mask,
            token_type_ids=token_type_ids
        )
        outputs_start = (outputs_start1 + outputs_start2 + outputs_start3 + outputs_start4 + outputs_start5) / 5
        outputs_end = (outputs_end1 + outputs_end2 + outputs_end3 + outputs_end4 + outputs_end5) / 5
        
        outputs_start = torch.softmax(outputs_start, dim=1).cpu().detach().numpy()
        outputs_end = torch.softmax(outputs_end, dim=1).cpu().detach().numpy()
        jaccard_scores = []
        for px, tweet in enumerate(orig_tweet):
            selected_tweet = orig_selected[px]
            tweet_sentiment = sentiment[px]
            _, output_sentence = calculate_jaccard_score(
                original_tweet=tweet,
                target_string=selected_tweet,
                sentiment_val=tweet_sentiment,
                idx_start=np.argmax(outputs_start[px, :]),
                idx_end=np.argmax(outputs_end[px, :]),
                offsets=offsets[px]
            )
            final_output.append(output_sentence)

100%|██████████| 442/442 [01:16<00:00,  5.74it/s]


In [12]:
# from collections import Counter
# train = pd.read_csv("../input/tweet-sentiment-extraction/train.csv")

# train_pn = train.loc[train['sentiment']=='positive']
# xck = train_pn.selected_text.values.tolist()
# c=Counter()
# for i in range(len(xck)):
#     s = xck[i]
#     if len(str(s))>0:
#         #start = str(s).split()[0]
#         end = str(s).split()[-1]
#         #if len(start)>=1:
#        #     c[start] += 1 
#         if len(end)>=1 and end[-1] in string.punctuation:
#             c[end] +=1
            
            
# train_ng = train.loc[train['sentiment']=='negative']
# xck = train_ng.selected_text.values.tolist()
# cb=Counter()
# for i in range(len(xck)):
#     s = xck[i]
#     if len(str(s))>0:
#        # start = str(s).split()[0]
#         end = str(s).split()[-1]
#        # if len(start)>=1:
#         #    cb[start] += 1 
#         if len(end)>=1 and end[-1] in string.punctuation:
#             cb[end] +=1
    

# def pp_post(text, pre, sen):
#     if sen == 'positive':
#         pre_copy = pre
#         text = str(text).split()
#         pre = str(pre).split()
        
#       #  first = pre[0]
#         end = pre[-1]
#         if len(end)>=5 and end[-1] in string.punctuation and end[-2] in string.punctuation and end[-3] in string.punctuation:
#             kk = [c[end], c[end[:-1]], c[end[:-2]], c[end[:-3]]]
#             result = kk.index(max(kk))
#             if result !=0:
#                 pre = " ".join(pre[:-1])+" "+end[:-result]
#             else:
#                 pre = pre_copy
#         else:
#             pre = pre_copy
            
#     if sen == 'negative':
#         pre_copy = pre
#         text = str(text).split()
#         pre = str(pre).split()
        
#       #  first = pre[0]
#         end = pre[-1]
#         if len(end)>=6 and end[-1] in string.punctuation and end[-2] in string.punctuation and end[-3] in string.punctuation:
#             kk = [cb[end], cb[end[:-1]], cb[end[:-2]], cb[end[:-3]]]
#             result = kk.index(max(kk))
#             if result !=0:
#                 pre = " ".join(pre[:-1])+" "+end[:-result]
#             else:
#                 pre = pre_copy
#         else:
#             pre = pre_copy 
      
    
#     return pre


In [13]:
df_test['selected_text'] = final_output
#df_test['selected_text'] = df_test.apply(lambda x:pp_post(x['text'], x['selected_text'], x['sentiment']),axis=1)

In [14]:
#pp
import re

def post_process_negative(text):
    text = text.lower()
    text = re.sub(r'((\.)\2{2,})', '..', text)
    text = re.sub(r'http\S+', '', text)
    #text = text.replace('????', '??')
    return text

def post_process_positive(text):
    text = text.lower()
    text = re.sub(r'http\S+', '', text)
    #text = text.replace('????', '??')
    return text

def post_process_neutral(text):
    text = text.lower()
    return text

# Post Process
df_test.loc[df_test['sentiment'] == 'negative', 'selected_text'] = df_test.loc[df_test['sentiment'] == 'negative', 'selected_text'].apply(post_process_negative)
df_test.loc[df_test['sentiment'] == 'positive', 'selected_text'] = df_test.loc[df_test['sentiment'] == 'positive', 'selected_text'].apply(post_process_positive)

df_test[["selected_text"]].to_csv("submission.csv")
print(df_test[["selected_text"]])

                                          selected_text
0      Last session of the day http://twitpic.com/67ezh
1                                              exciting
2                                         such a shame!
3                                                 happy
4                                           i like it!!
...                                                 ...
3529                                              tired
3530                                             thanks
3531         my little dog is sinking into depression..
3532                                i love your videos!
3533                                               cute

[3534 rows x 1 columns]


In [15]:
sample = pd.read_csv("../input/tweet-sentiment-extraction/sample_submission.csv")
sample['selected_text'] = df_test['selected_text'].values.tolist()



In [16]:
sample.to_csv("submission.csv", index=False)

In [17]:
sample.head()

Unnamed: 0,textID,selected_text
0,f87dea47db,Last session of the day http://twitpic.com/67ezh
1,96d74cb729,exciting
2,eee518ae67,such a shame!
3,01082688c6,happy
4,33987a8ee5,i like it!!


In [18]:
sample.isnull().sum()

textID           0
selected_text    0
dtype: int64