In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
input_file_paths = (
    os.path.join(root, name)
    for root, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for input_file_path in input_file_paths:
    print(input_file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import pandas as pd 
import numpy as np 
import seaborn as sns 
from matplotlib import pyplot as plt 
from transformers import RobertaModel,RobertaTokenizer,RobertaConfig,AdamW,get_linear_schedule_with_warmup
from tokenizers import ByteLevelBPETokenizer
import torch 
from torch.utils.data import Dataset,DataLoader
from torch.nn.utils.rnn import pad_sequence
from torch import nn,optim 
import os 
import time 
import random 
from collections import defaultdict
from tqdm import tqdm
import re
import math
import pickle
from sklearn.model_selection import StratifiedKFold
import gc 
gc.enable()

# Load data

In [None]:
# Read the competition tables in order: training set, test set, sample submission.
train,test,submission=(
    pd.read_csv(csv_path)
    for csv_path in (
        '../input/tweet-sentiment-extraction/train.csv',
        '../input/tweet-sentiment-extraction/test.csv',
        '../input/tweet-sentiment-extraction/sample_submission.csv',
    )
)

In [None]:
# Character count and whitespace-separated word count of each tweet
# (the text is cast to str first, so missing values are counted as the string 'nan').
train['length']=train['text'].astype(str).map(len)
train['num_words']=train['text'].astype(str).str.split().map(len)

In [None]:
train.describe()

In [None]:
train

# Preprocess Data

In [None]:
train.isnull().sum()

One row contains a NaN value, so we drop it below.

In [None]:
# Discard rows that contain missing values and renumber the index from 0.
train=train.dropna().reset_index(drop=True)

In [None]:
train

# Data Loader

In [None]:
class Config:
    """Notebook-wide settings plus the shared byte-level BPE tokenizer.

    Everything here is a class attribute evaluated once, when the class body
    runs; the tokenizer files are read from the roberta-base input directory.
    """
    # Truncation limit handed to the tokenizer below. It applies to the tweet
    # tokens only: get_data adds four more ids (<s>, sentiment, </s>, </s>).
    MAX_LENGTH=256
    # Batch size used by every DataLoader in this notebook.
    BATCH_SIZE=32
    # Epoch setting read by train_model's epoch loop.
    EPOCHS=5
    TOKENIZER=ByteLevelBPETokenizer(
      vocab='../input/roberta-base/vocab.json',
      merges='../input/roberta-base/merges.txt',
      add_prefix_space=True,
      lowercase=True
    )
    TOKENIZER.enable_truncation(max_length=MAX_LENGTH)
    # token string -> id
    VOCAB=TOKENIZER.get_vocab()
    # id -> token string (inverse of VOCAB)
    INT_TO_WORD={value:key for key,value in VOCAB.items()}
    # Ids of the special tokens used to frame and pad the model input.
    CLS_ID=VOCAB['<s>']
    SEP_ID=VOCAB['</s>']
    PAD_ID=VOCAB['<pad>']

In [None]:
# Scratch check: encode a sample sentence so its token offsets can be inspected in the next cell.
temp=Config.TOKENIZER.encode(' hello, my name is khanh')

In [None]:
temp.offsets

In [None]:
def get_data(tweet,sentiment,config):
    """Normalise a tweet and build the unpadded model inputs for it.

    The tweet is lower-cased, its whitespace runs are collapsed to single
    spaces and one leading space is prepended. The id sequence is laid out as
    ``<s> sentiment </s> tweet-tokens </s>``.

    Args:
        tweet: raw tweet text.
        sentiment: one of 'positive', 'negative', 'neutral'.
        config: object exposing TOKENIZER, VOCAB, CLS_ID and SEP_ID.

    Returns:
        (normalised_tweet, input_ids, attention_mask, offsets). ``offsets``
        holds (0, 0) for the four added tokens and the tokenizer's character
        offsets for the tweet tokens. No padding is applied here.
    """
    normalised=" "+" ".join(tweet.lower().split())
    encoding=config.TOKENIZER.encode(normalised)
    # All three sentiment words are looked up in the vocabulary on every call.
    sentiment_ids={
        label:config.VOCAB[label] for label in ['positive','negative','neutral']
    }
    prefix_ids=[config.CLS_ID,sentiment_ids[sentiment],config.SEP_ID]
    input_ids=prefix_ids+encoding.ids+[config.SEP_ID]
    attention_mask=[1 for _ in input_ids]
    no_span=(0,0)
    offsets=[no_span,no_span,no_span]+encoding.offsets+[no_span]
    return normalised,input_ids,attention_mask,offsets


In [None]:
def find_index(tweet,selected_text,offsets,config):
    """Map a selected-text span onto token positions.

    ``selected_text`` is lower-cased and whitespace-collapsed, then searched
    for inside ``tweet``. When it occurs several times the last occurrence is
    used. Every token whose character range overlaps the match counts as part
    of the span.

    Args:
        tweet: normalised tweet text that ``offsets`` refer to.
        selected_text: raw selected text to locate.
        offsets: sequence of (start, end) character offsets, one per token;
            empty ranges such as (0, 0) never match.
        config: unused; kept so existing callers keep working.

    Returns:
        (start_index, end_index): positions in ``offsets`` of the first and
        last overlapping token, or (None, None) when the selected text is
        empty, is not found in ``tweet``, or overlaps no token.
    """
    needle=" ".join(selected_text.lower().split())
    if not needle:
        # Empty / whitespace-only selection: nothing to locate.
        return None,None
    char_start=tweet.rfind(needle)
    if char_start==-1:
        return None,None
    char_end=char_start+len(needle)  # exclusive
    token_positions=[
        position
        for position,(offset1,offset2) in enumerate(offsets)
        # Half-open ranges [offset1, offset2) and [char_start, char_end) overlap.
        if offset1<offset2 and offset1<char_end and offset2>char_start
    ]
    if not token_positions:
        return None,None
    return token_positions[0],token_positions[-1]


In [None]:
# Locate each training example's selected text at token level and store the
# first/last token positions as two new columns.
start_indexs,end_indexs=[],[]
for raw_tweet,sentiment,raw_selected in zip(train['text'],train['sentiment'],train['selected_text']):
    clean_tweet,_,_,offsets=get_data(raw_tweet,sentiment,Config)
    span=find_index(clean_tweet,raw_selected,offsets,Config)
    if span[0] is None:
        # The selected text could not be located: show the pair for inspection.
        print(clean_tweet,'------>',raw_selected)
    start_indexs.append(span[0])
    end_indexs.append(span[1])

train['start_index']=start_indexs
train['end_index']=end_indexs

In [None]:
train

In [None]:
class TweetSentimentExtraction(Dataset):
    """Dataset that tokenises one tweet per item and attaches span labels when present.

    Args:
        data: DataFrame with ``text`` and ``sentiment`` columns. When it also
            has ``start_index`` and ``end_index`` columns, each item carries
            those labels.
        config: object exposing TOKENIZER, MAX_LENGTH and VOCAB (and whatever
            get_data reads from it).

    Attributes:
        has_labels: True when ``data`` provides both label columns.
        is_test: True when ``data`` has no labels. The flag used to be set with
            the opposite meaning (True for labelled data); it now matches its name.
    """
    def __init__(self,data,config):
        self.data=data
        self.tokenizer=config.TOKENIZER
        self.max_length=config.MAX_LENGTH
        self.config=config
        # Key label handling on the columns that __getitem__ actually reads.
        self.has_labels="start_index" in self.data and "end_index" in self.data
        self.is_test=not self.has_labels
        self.cls_id=self.config.VOCAB['<s>']
        self.sep_id=self.config.VOCAB['</s>']
        self.pad_id=self.config.VOCAB['<pad>']


    def __len__(self):
        """Number of rows in the underlying DataFrame."""
        return len(self.data)

    def __getitem__(self,idx):
        """Return the unpadded model inputs (and labels, if any) for row ``idx``.

        The result has 'input_ids' and 'attention_mask' as long tensors, the
        normalised 'tweet' string, the per-token 'offsets' list and, for
        labelled data, 'start_index' and 'end_index'.
        """
        row=self.data.iloc[idx]
        tweet,input_ids,attention_mask,offsets=get_data(row.text,row.sentiment,self.config)
        data={
            'input_ids':torch.tensor(input_ids,dtype=torch.long),
            'attention_mask':torch.tensor(attention_mask,dtype=torch.long),
            'tweet':tweet,
            'offsets':offsets,
        }
        if self.has_labels:
            data['start_index']=row.start_index
            data['end_index']=row.end_index
        return data
    


class MyCollate:
    """Collate function that pads a list of dataset items into one batch.

    Args:
        pad_id: token id used to right-pad ``input_ids``.
        is_test: when True the batch carries no ``start_index`` / ``end_index``.
    """
    def __init__(self,pad_id,is_test=False):
        self.pad_id=pad_id
        self.is_test=is_test

    def __call__(self,batch):
        """Pad every item to the longest sequence in ``batch``.

        Returns a dict with 'input_ids' and 'attention_mask' tensors of shape
        (len(batch), longest sequence), 'offsets' (one list per item, padded
        with (0, 0) to that same length), 'tweet' (list of strings) and, unless
        ``is_test``, long tensors 'start_index' and 'end_index'.
        """
        input_ids=pad_sequence(
            [item['input_ids'] for item in batch],
            batch_first=True,padding_value=self.pad_id
        )
        attention_mask=pad_sequence(
            [item['attention_mask'] for item in batch],
            batch_first=True,padding_value=0
        )
        # Pad each item's offsets individually, so that any token position the
        # model can predict (including padded ones) has an offsets entry.
        # The old code padded the batch-level list instead.
        seq_len=input_ids.size(1)
        offsets=[
            list(item['offsets'])+[(0,0)]*(seq_len-len(item['offsets']))
            for item in batch
        ]
        collated={
            "input_ids":input_ids,
            "attention_mask":attention_mask,
            "offsets":offsets,
            'tweet':[item['tweet'] for item in batch]
        }
        if not self.is_test:
            collated["start_index"]=torch.tensor([item['start_index'] for item in batch],dtype=torch.long)
            collated["end_index"]=torch.tensor([item['end_index'] for item in batch],dtype=torch.long)
        return collated
        

In [None]:
def get_train_val_loader(df,train_index,val_index):
    """Build the training and validation DataLoaders for one fold.

    Args:
        df: full labelled DataFrame.
        train_index: positional row indices of the training split.
        val_index: positional row indices of the validation split.

    Returns:
        (train_loader, val_loader). The training loader reshuffles every
        epoch; the validation loader keeps the row order.
    """
    train_df=df.iloc[train_index]
    val_df=df.iloc[val_index]
    train_dataset=TweetSentimentExtraction(train_df,Config)
    val_dataset=TweetSentimentExtraction(val_df,Config)
    # shuffle=True for training: without it every epoch sees the same batches
    # in the same (file) order.
    train_loader=DataLoader(train_dataset,batch_size=Config.BATCH_SIZE,shuffle=True,num_workers=2,collate_fn=MyCollate(Config.PAD_ID))
    val_loader=DataLoader(val_dataset,batch_size=Config.BATCH_SIZE,shuffle=False,num_workers=2,collate_fn=MyCollate(Config.PAD_ID))
    return train_loader,val_loader

# Model

In [None]:
# Prefer the GPU whenever CUDA is usable; otherwise run on the CPU.
device=torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

In [None]:
class Model(nn.Module):
    """RoBERTa encoder with a per-token head that scores span start and end.

    The encoder is loaded from the local roberta-base directory with
    ``output_hidden_states=True``. The head averages the last four hidden
    layers and maps every token to two logits.
    """
    def __init__(self):
        super(Model,self).__init__()
        self.config=RobertaConfig.from_pretrained('../input/roberta-base/config.json',output_hidden_states=True)
        self.bert=RobertaModel.from_pretrained('../input/roberta-base',config=self.config)
        self.hidden_size=self.bert.config.hidden_size
        # Not used in forward(). Kept because it contributes keys to the
        # state_dict, and removing it would break strict loading of
        # checkpoints saved from this class.
        self.norm=nn.LayerNorm(self.hidden_size)
        self.linear=nn.Sequential(
            nn.Linear(self.hidden_size,self.hidden_size),
            nn.LayerNorm(self.hidden_size),
            nn.ReLU(),
            nn.Linear(self.hidden_size,2)
        )
        # Initialise only the new head; the encoder keeps its pretrained weights.
        self.linear.apply(self._init_weights)

    def _init_weights(self,module):
        """Initialise one head module.

        Linear weights are drawn from N(0, initializer_range) with zero bias;
        LayerNorm is reset to weight 1 / bias 0. Other module types are left
        untouched.
        """
        if isinstance(module,nn.Linear):
            module.weight.data.normal_(mean=0,std=self.bert.config.initializer_range)
            if module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module,nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)

    # Alias for the method's previous (name-mangled) spelling.
    __init__weight=_init_weights

    def forward(self,input_ids,attention_mask,token_type_ids=None):
        """Return (start_logit, end_logit), each with the trailing size-1 axis squeezed away.

        ``token_type_ids`` is accepted but not used.
        """
        outputs=self.bert(input_ids,attention_mask)
        # One hidden-state tensor per layer; the last four are averaged.
        hidden_states=outputs.hidden_states
        out=torch.stack([hidden_states[-1],hidden_states[-2],hidden_states[-3],hidden_states[-4]])
        out=torch.mean(out,0)
        out=self.linear(out)
        # Split the two output channels into separate start / end scores.
        start_logit,end_logit=torch.split(out,1,-1)
        start_logit=start_logit.squeeze(dim=-1)
        end_logit=end_logit.squeeze(dim=-1)
        return start_logit,end_logit
        

In [None]:
def get_selected_text(text,start_index,end_index,offsets):
    """Rebuild a token span as text.

    Concatenates, without any separator, the slice of ``text`` covered by each
    token from ``start_index`` to ``end_index`` (both inclusive). Returns an
    empty string when ``start_index`` is greater than ``end_index``.
    """
    return "".join(
        text[offsets[position][0]:offsets[position][1]]
        for position in range(start_index,end_index+1)
    )

In [None]:
def get_loss(start_targets,end_targets,start_logit,end_logit):
    """Sum of the mean cross-entropy losses for the start and the end position."""
    criterion=nn.CrossEntropyLoss(reduction="mean")
    start_loss=criterion(start_logit,start_targets)
    end_loss=criterion(end_logit,end_targets)
    return start_loss+end_loss

def jaccard_score(str1,str2):
    """Word-level Jaccard similarity of two strings (case-insensitive).

    Both strings are lower-cased and split on whitespace; the score is
    |intersection| / |union| of the two word sets. Two strings without any
    words are treated as identical and score 1.0, where the plain formula
    would divide by zero.
    """
    a=set(str1.lower().split())
    b=set(str2.lower().split())
    c=a.intersection(b)
    union_size=len(a)+len(b)-len(c)
    if union_size==0:
        return 1.0
    return float(len(c))/union_size

def computer_jaccard_score(text,start_index,end_index,start_logit,end_logit,offsets):
    """Jaccard score between the labelled span and the span the logits predict.

    The predicted start / end are the argmax of the respective logits. If the
    predicted start lies after the predicted end, the whole ``text`` is used
    as the prediction.
    """
    pred_start,pred_end=np.argmax(start_logit),np.argmax(end_logit)
    predicted=text if pred_start>pred_end else get_selected_text(text,pred_start,pred_end,offsets)
    expected=get_selected_text(text,start_index,end_index,offsets)
    return jaccard_score(expected,predicted)

def save_checkpoint(model_state_dict,fold,output_dir="."):
    """Save a fold's weights as ``model_fold{fold}.pth`` inside ``output_dir``.

    Args:
        model_state_dict: state dict to serialise with ``torch.save``.
        fold: fold identifier used in the file name.
        output_dir: target directory, created if missing. Defaults to the
            current working directory. The file used to be written to the
            filesystem root ('/'), outside the working directory that the
            notebook header says is preserved as output.
    """
    os.makedirs(output_dir,exist_ok=True)
    checkpoint_path=os.path.join(output_dir,f"model_fold{fold}.pth")
    torch.save(model_state_dict,checkpoint_path)
    print("Save model done")

In [None]:
def evaluate(model,loader,len_val):
    """Run the model over ``loader`` without gradients and score it.

    The model is switched to eval mode and is left in that mode.

    Args:
        model: network returning (start_logit, end_logit).
        loader: DataLoader yielding labelled batches.
        len_val: number of examples, used to average the Jaccard score.

    Returns:
        (mean loss per batch, mean Jaccard score per example).
    """
    print('---------------------------TIME FOR EVALUATE---------------------------')
    model.eval()
    total_loss,total_jaccard=0,0
    with torch.no_grad():
        for batch in loader:
            batch_ids=batch['input_ids'].to(device)
            batch_mask=batch['attention_mask'].to(device)
            batch_offsets,batch_tweets=batch['offsets'],batch['tweet']
            start_logit,end_logit=model(batch_ids,batch_mask)
            batch_loss=get_loss(
                batch['start_index'].to(device),
                batch['end_index'].to(device),
                start_logit,
                end_logit
            )
            total_loss+=batch_loss.item()
            # Move everything to NumPy for the per-example Jaccard computation.
            true_starts=batch['start_index'].cpu().detach().numpy()
            true_ends=batch['end_index'].cpu().detach().numpy()
            start_scores=start_logit.cpu().detach().numpy()
            end_scores=end_logit.cpu().detach().numpy()
            for row in range(len(batch_ids)):
                total_jaccard+=computer_jaccard_score(
                    batch_tweets[row],
                    true_starts[row],
                    true_ends[row],
                    start_scores[row],
                    end_scores[row],
                    batch_offsets[row]
                )
    return total_loss/len(loader),total_jaccard/len_val

In [None]:
def train_model(model,train_loader,val_loader,optimizer,scheduler,fold,len_val):
    """Train one fold, validate after every epoch and save the best weights.

    Args:
        model: network returning (start_logit, end_logit).
        train_loader: DataLoader yielding labelled training batches.
        val_loader: DataLoader yielding labelled validation batches.
        optimizer: optimizer stepped once per batch.
        scheduler: learning-rate scheduler stepped once per batch.
        fold: fold identifier, used for logging and the checkpoint name.
        len_val: number of validation examples (passed on to evaluate).

    Returns:
        defaultdict(list) with per-epoch 'train_loss', 'val_loss' and
        'jaccard_score'.
    """
    history=defaultdict(list)
    best_score=None
    best_state=None
    print(f"----------------------------------FOLD {fold}----------------------------------\n\n")
    # Exactly Config.EPOCHS epochs (this loop used to run EPOCHS+1).
    for epoch in range(Config.EPOCHS):
        # evaluate() leaves the model in eval mode, so training mode has to be
        # restored at the start of every epoch, not just once before the loop.
        model.train()
        train_loss=0
        start_time=time.time()
        print('---------------------------TIME FOR TRANING---------------------------')
        for idx,data in enumerate(train_loader):
            input_ids=data['input_ids'].to(device)
            attention_mask=data['attention_mask'].to(device)
            start_logit,end_logit=model(input_ids,attention_mask)
            optimizer.zero_grad()
            loss=get_loss(
                data['start_index'].to(device),
                data['end_index'].to(device),
                start_logit,
                end_logit
            )
            loss.backward()
            optimizer.step()
            scheduler.step()
            train_loss+=loss.item()
            if idx%100==0:
                print(idx,end=" ")
        print()
        train_loss/=len(train_loader)
        val_loss,score=evaluate(model,val_loader,len_val)
        history['train_loss'].append(train_loss)
        history['val_loss'].append(val_loss)
        history['jaccard_score'].append(score)
        print(f"Epochs:{epoch}---Train loss:{train_loss}---Val loss:{val_loss}---Jaccard score val:{score}---Time:{time.time()-start_time}")
        if best_score is None or score>best_score:
            # state_dict() hands back references to the live parameters; clone
            # them so later epochs cannot overwrite the best snapshot.
            best_state={
                name:tensor.detach().cpu().clone()
                for name,tensor in model.state_dict().items()
            }
            best_score=score

    # Nothing to save if no epoch ran (Config.EPOCHS == 0).
    if best_state is not None:
        save_checkpoint(best_state,fold)
    print('\n\n')
    return history


In [None]:
# kfold=StratifiedKFold(n_splits=5,shuffle=True,random_state=43)

# for fold,(train_indexs,val_indexs) in enumerate(kfold.split(train,train.sentiment),start=1):
#     model=Model().to(device)
#     optimizer=AdamW(model.parameters(),lr=1e-5)
#     scheduler=get_linear_schedule_with_warmup(
#         optimizer,
#         num_warmup_steps=50,
#         num_training_steps=len(train)//Config.BATCH_SIZE*Config.EPOCHS    
#     )

#     train_loader,val_loader=get_train_val_loader(train,train_indexs,val_indexs)
#     train_model(model,train_loader,val_loader,optimizer,scheduler,fold,len(val_indexs))
#     del model,optimizer,scheduler
#     torch.cuda.empty_cache()
#     gc.collect()
  

In [None]:
# In-order loader over the test tweets; is_test=True makes the collate
# function leave out the start/end label tensors.
test_dataset=TweetSentimentExtraction(test,Config)
test_collate=MyCollate(Config.PAD_ID,is_test=True)
test_loader=DataLoader(
    test_dataset,
    batch_size=Config.BATCH_SIZE,
    shuffle=False,
    num_workers=2,
    collate_fn=test_collate,
)

In [None]:
# Load one model per checkpoint file found in the checkpoint directory.
models=[]
for file_model in sorted(os.listdir('../input/tweetsentimentextractionroberta')):
    model_temp=Model().to(device)
    # map_location lets a checkpoint that was saved from a GPU be loaded on
    # whatever device is in use now (including a CPU-only machine).
    state_dict=torch.load('../input/tweetsentimentextractionroberta/'+file_model,map_location=device)
    model_temp.load_state_dict(state_dict)
    model_temp.eval()
    models.append(model_temp)

In [None]:
def get_predict(input_ids,attention_mask,size_batch):
    """Ensemble the start/end token predictions of every model in ``models``.

    Each model predicts a start and an end position per example (argmax of its
    logits). The positions are averaged over the models and truncated to int.

    Args:
        input_ids: batch of token ids, already on ``device``.
        attention_mask: matching attention mask, already on ``device``.
        size_batch: number of examples in the batch.

    Returns:
        (start_preds, end_preds): two integer NumPy arrays of length ``size_batch``.

    Raises:
        ValueError: if ``models`` is empty (the average would divide by zero).
    """
    if not models:
        raise ValueError("get_predict needs at least one loaded model")
    start_preds=torch.zeros(size_batch,dtype=torch.long,device=device)
    end_preds=torch.zeros(size_batch,dtype=torch.long,device=device)
    # Inference only: skip autograd bookkeeping to save memory and time.
    with torch.no_grad():
        for model in models:
            start_logit,end_logit=model(input_ids,attention_mask)
            start_preds+=torch.argmax(start_logit,dim=1)
            end_preds+=torch.argmax(end_logit,dim=1)
    start_preds=start_preds.cpu().detach().numpy()
    end_preds=end_preds.cpu().detach().numpy()
    return (start_preds/len(models)).astype(int),(end_preds/len(models)).astype(int)
        
        

In [None]:
# Predict a text span for every test tweet, batch by batch.
preds=[]
for batch in test_loader:
    batch_tweets=batch['tweet']
    batch_offsets=batch['offsets']
    pred_starts,pred_ends=get_predict(
        batch['input_ids'].to(device),
        batch['attention_mask'].to(device),
        len(batch_tweets)
    )
    for tweet_text,first,last,token_offsets in zip(batch_tweets,pred_starts,pred_ends,batch_offsets):
        # A start after the end is not a usable span: fall back to the whole tweet.
        preds.append(
            tweet_text if first>last
            else get_selected_text(tweet_text,first,last,token_offsets)
        )

In [None]:
# preds=[]
# for data in test_loader:
#     input_ids=data['input_ids'].to(device)
#     attention_mask=data['attention_mask'].to(device)
#     tweet=data['tweet']
#     offsets=data['offsets']
#     start_logits=[]
#     end_logits=[]
#     for model in models:
#         with torch.no_grad():
#             outputs=model(input_ids,attention_mask)
#             start_logits.append(torch.softmax(outputs[0],dim=1).cpu().detach().numpy())
#             end_logits.append(torch.softmax(outputs[1],dim=1).cpu().detach().numpy())
#     start_logits=np.mean(start_logits,axis=0)
#     end_logits=np.mean(end_logits,axis=0)
#     start_indexs=np.argmax(start_logits,axis=1)
#     end_indexs=np.argmax(end_logits,axis=1)
#     for i in range(len(tweet)):
#         start_pred=start_indexs[i]
#         end_pred=end_indexs[i]
#         if start_pred>end_pred:
#             result=tweet[i]
#         else:
#             result=get_selected_text(tweet[i],start_pred,end_pred,offsets[i])
#         preds.append(result)

In [None]:
def replace_string(x):
    """Collapse runs of '!' into a single '!' and runs of '.' into a single '.'."""
    x=re.sub(r'!+','!',x)
    # The dot must be escaped: an unescaped '.+' matches any run of characters
    # and would replace the whole string with a single '.'.
    x=re.sub(r'\.+','.',x)
    return x

In [None]:
# Put the predictions into the submission table, write it to the working
# directory as submission.csv and preview the first rows.
submission.selected_text=preds
# Optional post-processing with replace_string (currently disabled):
# submission['selected_text'] = submission['selected_text'].apply(lambda x: replace_string(x))
submission.to_csv('submission.csv',index=False)
submission.head(10)