<a href="https://colab.research.google.com/github/sangeetsaurabh/tweet_phrase_kaggle_competition/blob/master/pytorch_transformer/tweet_sentiment_xlnet_pytorch.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Xlnet Transformer implementation to predict selected text

#### install transformers

In [1]:
!pip install transformers



In [2]:
import numpy as np
import pandas as pd
import os
import warnings
import random
import torch 
from torch import nn
import torch.optim as optim
from sklearn.model_selection import StratifiedKFold
import tokenizers
from transformers import XLMRobertaModel, XLMRobertaConfig, XLMRobertaTokenizer
from transformers import XLNetModel,XLNetConfig,XLNetTokenizer

warnings.filterwarnings('ignore')
pd.set_option('display.max_colwidth', None)

# Seed

In [3]:
def seed_everything(seed_value):
    random.seed(seed_value)
    np.random.seed(seed_value)
    torch.manual_seed(seed_value)
    os.environ['PYTHONHASHSEED'] = str(seed_value)
    
    if torch.cuda.is_available(): 
        torch.cuda.manual_seed(seed_value)
        torch.cuda.manual_seed_all(seed_value)
        torch.backends.cudnn.deterministic = True
        torch.backends.cudnn.benchmark = True

seed = 42
seed_everything(seed)

### Set up the path

In [4]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [5]:
data_folder = "/content/drive/My Drive/tweet-sentiment-extraction/data"
xlnet_folder = "/content/drive/My Drive/tweet-sentiment-extraction/xlnet/"
tmp_folder = '/tmp'

##### install required libraries

Install Google sentence piece tokenizer and required files.

Given a tweet, we are trying to select subset of text that describes the tweet the best. In order to make the prediction accurately, offsets of each token are required. Sentencepiece tokenizer is being used here as it's easy to get offsets for each token using that tokenizer.

In [6]:
!pip install protobuf
!wget https://raw.githubusercontent.com/google/sentencepiece/master/python/sentencepiece_pb2.py
!pip install sentencepiece

--2020-07-02 18:29:01--  https://raw.githubusercontent.com/google/sentencepiece/master/python/sentencepiece_pb2.py
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 151.101.0.133, 151.101.64.133, 151.101.128.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|151.101.0.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 7382 (7.2K) [text/plain]
Saving to: ‘sentencepiece_pb2.py.3’


2020-07-02 18:29:01 (73.8 MB/s) - ‘sentencepiece_pb2.py.3’ saved [7382/7382]



In [7]:
!wget https://s3.amazonaws.com/models.huggingface.co/bert/xlnet-base-cased-spiece.model

--2020-07-02 18:29:04--  https://s3.amazonaws.com/models.huggingface.co/bert/xlnet-base-cased-spiece.model
Resolving s3.amazonaws.com (s3.amazonaws.com)... 52.217.12.22
Connecting to s3.amazonaws.com (s3.amazonaws.com)|52.217.12.22|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 798011 (779K) [binary/octet-stream]
Saving to: ‘xlnet-base-cased-spiece.model.3’


2020-07-02 18:29:04 (2.46 MB/s) - ‘xlnet-base-cased-spiece.model.3’ saved [798011/798011]



# Data Loader

#### Defining Sentencepiece tokenizer

In [8]:
import sentencepiece_pb2
import sentencepiece as spm
class SentencePieceTokenizer:
    def __init__(self):
        self.sp = spm.SentencePieceProcessor()
        #self.sp.load(os.path.join(model_path, "spiece.model"))
        self.sp.load("xlnet-base-cased-spiece.model")
    
    def encode(self, sentence):
        spt = sentencepiece_pb2.SentencePieceText()
        spt.ParseFromString(self.sp.encode_as_serialized_proto(sentence))
        offsets = []
        tokens = []
        for piece in spt.pieces:
            tokens.append(piece.id)
            offsets.append((piece.begin, piece.end))
        return tokens, offsets

In [9]:
tokenizer1 = SentencePieceTokenizer()
tokenizer1.sp.pad_id()

5

In [10]:
class TweetDataset(torch.utils.data.Dataset):
    def __init__(self, df, max_len=100):
        self.df = df
        self.max_len = max_len
        self.labeled = 'selected_text' in df
        self.tokenizer1 = SentencePieceTokenizer()
        ### using xlnet inbuilt tokenizer as well as it returns some of the data in the right format to feed into xlnet transformer
        self.tokenizer2 = XLNetTokenizer("xlnet-base-cased-spiece.model", do_lower_case=True)
        

    def __getitem__(self, index):
        data = {}
        row = self.df.iloc[index]
        
        ids, masks, token_type_ids, tweet, offsets,row_id = self.get_input_data(row)

        data['ids'] = ids
        data['masks'] = masks
        data['token_type_ids'] = token_type_ids
        data['tweet'] = tweet
        data['offsets'] = offsets
        data['row_id'] = row_id
        
        if self.labeled:
            #print("I came here")
            start_idx, end_idx = self.get_target_idx(row, tweet, offsets)
            data['start_idx'] = start_idx
            data['end_idx'] = end_idx
        
        return data

    def __len__(self):
        return len(self.df)
    
    def get_input_data(self, row):
        tweet = " " + " ".join(row.text.lower().split())
        encoding_ids, encoding_offsets = self.tokenizer1.encode(tweet)
        sentiment_id, _ = self.tokenizer1.encode(row.sentiment)
        ids = sentiment_id + [4] + encoding_ids + [4,3]
        offsets = [(0, 0)] * 2 + encoding_offsets + [(0, 0)]*2
        token_type_ids = self.tokenizer2.create_token_type_ids_from_sequences(sentiment_id,encoding_ids)
                
        pad_len = self.max_len - len(ids)
        if pad_len > 0:
            ids = [5] * pad_len + ids
            offsets = [(0, 0)] * pad_len + offsets
            token_type_ids = [3]*pad_len + token_type_ids
        
        ids = torch.tensor(ids)
        masks = torch.where(ids != 5, torch.tensor(1), torch.tensor(0))
        offsets = torch.tensor(offsets)
        token_type_ids = torch.tensor(token_type_ids)
        
        return ids, masks, token_type_ids, tweet, offsets, row.idx
        
    def get_target_idx(self, row, tweet, offsets):
        selected_text = " " +  " ".join(row.selected_text.lower().split())
        #print(selected_text)
        #print(row.idx)

        len_st = len(selected_text) - 1
        idx0 = None
        idx1 = None

        for ind in (i for i, e in enumerate(tweet) if e == selected_text[1]):
            if " " + tweet[ind: ind+len_st] == selected_text:
                idx0 = ind
                idx1 = ind + len_st - 1
                break

        char_targets = [0] * len(tweet)
        if idx0 != None and idx1 != None:
            for ct in range(idx0, idx1 + 1):
                char_targets[ct] = 1

        target_idx = []
        for j, (offset1, offset2) in enumerate(offsets):
            if sum(char_targets[offset1: offset2]) > 0:
                target_idx.append(j)

        start_idx = target_idx[0]
        end_idx = target_idx[-1]
        
        return start_idx, end_idx
        
def get_train_val_loaders(df, train_idx, val_idx, batch_size=8):
    train_df = df.iloc[train_idx]
    val_df = df.iloc[val_idx]

    train_loader = torch.utils.data.DataLoader(
        TweetDataset(train_df), 
        batch_size=batch_size, 
        shuffle=True, 
        num_workers=2,
        drop_last=True)

    val_loader = torch.utils.data.DataLoader(
        TweetDataset(val_df), 
        batch_size=batch_size, 
        shuffle=False, 
        num_workers=2)

    dataloaders_dict = {"train": train_loader, "val": val_loader}

    return dataloaders_dict

def get_test_loader(df, batch_size=32):
    loader = torch.utils.data.DataLoader(
        TweetDataset(df), 
        batch_size=batch_size, 
        shuffle=False, 
        num_workers=2)    
    return loader

In [11]:
config = XLNetConfig.from_pretrained(
            'xlnet-base-cased', output_hidden_states=True)    
roberta = XLNetModel.from_pretrained(
            'xlnet-base-cased', config=config)

In [12]:
config.to_json_file(tmp_folder + '/xlnet_config.json')

In [13]:
roberta.save_pretrained(tmp_folder)

# Model

In [14]:
class TweetModel(nn.Module):
    def __init__(self):
        super(TweetModel, self).__init__()
        config = XLNetConfig.from_pretrained(
            tmp_folder + '/xlnet_config.json', output_hidden_states=True)    
        self.roberta = XLNetModel.from_pretrained(
            tmp_folder + '/pytorch_model.bin', config=config)
        #config = RobertaConfig.from_pretrained(
        #    'roberta-large', output_hidden_states=True)    
        #self.roberta = RobertaModel.from_pretrained(
        #    'roberta-large', config=config)
        self.dropout = nn.Dropout(0.5)
        self.fc = nn.Linear(config.hidden_size, 2)
        nn.init.normal_(self.fc.weight, std=0.02)
        nn.init.normal_(self.fc.bias, 0)

    def forward(self, input_ids, attention_mask,token_type_ids):
        #_, _, hs = self.roberta(input_ids, attention_mask,token_type_ids=token_type_ids)
        _,hs = self.roberta(input_ids, attention_mask,token_type_ids=token_type_ids)
        #print(len(output))
        #print(output[0].shape)
        #for i in range(len(output[1])):
        #  print(output[1][i].shape)
         
        x = torch.stack([hs[-1], hs[-2], hs[-3], hs[-4]])
        x = torch.mean(x, 0)
        x = self.dropout(x)
        x = self.fc(x)
        start_logits, end_logits = x.split(1, dim=-1)
        start_logits = start_logits.squeeze(-1)
        end_logits = end_logits.squeeze(-1)
                
        return start_logits, end_logits

# Loss Function

In [15]:
def loss_fn(start_logits, end_logits, start_positions, end_positions):
    ce_loss = nn.CrossEntropyLoss()
    start_loss = ce_loss(start_logits, start_positions)
    end_loss = ce_loss(end_logits, end_positions)    
    total_loss = start_loss + end_loss
    return total_loss

# Evaluation Function

In [16]:
def get_selected_text(text, start_idx, end_idx, offsets):
    selected_text = ""
    for ix in range(start_idx, end_idx + 1):
        selected_text += text[offsets[ix][0]: offsets[ix][1]]
        if (ix + 1) < len(offsets) and offsets[ix][1] < offsets[ix + 1][0]:
            selected_text += " "
    return selected_text

def jaccard(str1, str2): 
    a = set(str1.lower().split()) 
    b = set(str2.lower().split())
    c = a.intersection(b)
    return float(len(c)) / (len(a) + len(b) - len(c))

def compute_jaccard_score(text, start_idx, end_idx, start_logits, end_logits, offsets):
    start_pred = np.argmax(start_logits)
    end_pred = np.argmax(end_logits)
    if start_pred > end_pred:
        pred = text
    else:
        pred = get_selected_text(text, start_pred, end_pred, offsets)
        
    true = get_selected_text(text, start_idx, end_idx, offsets)
    
    return jaccard(true, pred)

# Training Function

In [17]:
def train_model(model, dataloaders_dict, criterion, optimizer, num_epochs, filename):
    model.cuda()

    for epoch in range(num_epochs):
        for phase in ['train', 'val']:
            if phase == 'train':
                model.train()
            else:
                model.eval()

            epoch_loss = 0.0
            epoch_jaccard = 0.0
            
            for data in (dataloaders_dict[phase]):
                ids = data['ids'].cuda()
                masks = data['masks'].cuda()
                token_type_ids = data['token_type_ids'].cuda()
                tweet = data['tweet']
                offsets = data['offsets'].numpy()
                start_idx = data['start_idx'].cuda()
                end_idx = data['end_idx'].cuda()

                optimizer.zero_grad()

                with torch.set_grad_enabled(phase == 'train'):

                    start_logits, end_logits = model(ids, masks,token_type_ids)

                    loss = criterion(start_logits, end_logits, start_idx, end_idx)
                    
                    if phase == 'train':
                        loss.backward()
                        optimizer.step()

                    epoch_loss += loss.item() * len(ids)
                    
                    start_idx = start_idx.cpu().detach().numpy()
                    end_idx = end_idx.cpu().detach().numpy()
                    start_logits = torch.softmax(start_logits, dim=1).cpu().detach().numpy()
                    end_logits = torch.softmax(end_logits, dim=1).cpu().detach().numpy()
                    
                    for i in range(len(ids)):                        
                        jaccard_score = compute_jaccard_score(
                            tweet[i],
                            start_idx[i],
                            end_idx[i],
                            start_logits[i], 
                            end_logits[i], 
                            offsets[i])
                        epoch_jaccard += jaccard_score
                    
            epoch_loss = epoch_loss / len(dataloaders_dict[phase].dataset)
            epoch_jaccard = epoch_jaccard / len(dataloaders_dict[phase].dataset)
            
            print('Epoch {}/{} | {:^5} | Loss: {:.4f} | Jaccard: {:.4f}'.format(
                epoch + 1, num_epochs, phase, epoch_loss, epoch_jaccard))
    
    torch.save(model.state_dict(), filename)

# Training

In [18]:
num_epochs = 3
batch_size = 32
#skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=seed)
skf = StratifiedKFold(n_splits=10, shuffle=True)

In [19]:
train_df = pd.read_csv(data_folder + '/train.csv')
train_df = train_df[train_df.text == train_df.text]
train_df['text'] = train_df['text'].astype(str)
train_df['selected_text'] = train_df['selected_text'].astype(str)
train_df['idx'] = train_df.index.copy()

for fold, (train_idx, val_idx) in enumerate(skf.split(train_df, train_df.sentiment), start=1): 
    print(f'Fold: {fold}')

    model = TweetModel()
    optimizer = optim.AdamW(model.parameters(), lr=2e-5, betas=(0.9, 0.999))
    criterion = loss_fn    
    dataloaders_dict = get_train_val_loaders(train_df, train_idx, val_idx, batch_size)
    #print(dataloaders_dict)


    train_model(
        model, 
        dataloaders_dict,
        criterion, 
        optimizer, 
        num_epochs,
        xlnet_folder + f'roberta_fold{fold}.pth')

Fold: 1
Epoch 1/3 | train | Loss: 2.5765 | Jaccard: 0.6337
Epoch 1/3 |  val  | Loss: 1.8659 | Jaccard: 0.6791
Epoch 2/3 | train | Loss: 1.8052 | Jaccard: 0.6976
Epoch 2/3 |  val  | Loss: 1.7979 | Jaccard: 0.6906
Epoch 3/3 | train | Loss: 1.6300 | Jaccard: 0.7175
Epoch 3/3 |  val  | Loss: 1.7506 | Jaccard: 0.7133
Fold: 2
Epoch 1/3 | train | Loss: 2.4949 | Jaccard: 0.6409
Epoch 1/3 |  val  | Loss: 1.7959 | Jaccard: 0.7002
Epoch 2/3 | train | Loss: 1.7712 | Jaccard: 0.7033
Epoch 2/3 |  val  | Loss: 1.7530 | Jaccard: 0.7039
Epoch 3/3 | train | Loss: 1.6019 | Jaccard: 0.7215
Epoch 3/3 |  val  | Loss: 1.7466 | Jaccard: 0.7091
Fold: 3
Epoch 1/3 | train | Loss: 2.4704 | Jaccard: 0.6427
Epoch 1/3 |  val  | Loss: 1.8486 | Jaccard: 0.6967
Epoch 2/3 | train | Loss: 1.7641 | Jaccard: 0.7030
Epoch 2/3 |  val  | Loss: 1.7434 | Jaccard: 0.7089
Epoch 3/3 | train | Loss: 1.5989 | Jaccard: 0.7219
Epoch 3/3 |  val  | Loss: 1.7040 | Jaccard: 0.7100
Fold: 4
Epoch 1/3 | train | Loss: 2.7005 | Jaccard: 0.6331

In [20]:
train_df.head()

Unnamed: 0,textID,text,selected_text,sentiment,idx
0,cb774db0d1,"I`d have responded, if I were going","I`d have responded, if I were going",neutral,0
1,549e992a42,Sooo SAD I will miss you here in San Diego!!!,Sooo SAD,negative,1
2,088c60f138,my boss is bullying me...,bullying me,negative,2
3,9642c003ef,what interview! leave me alone,leave me alone,negative,3
4,358bd9e861,"Sons of ****, why couldn`t they put them on the releases we already bought","Sons of ****,",negative,4


# Inference

In [21]:
%%time

test_df = pd.read_csv(data_folder + '/test.csv')
test_df["idx"] = test_df.index.copy()
test_df['text'] = test_df['text'].astype(str)
test_loader = get_test_loader(test_df)
predictions = []
models = []
for fold in range(skf.n_splits):
    model = TweetModel()
    model.cuda()
    model.load_state_dict(torch.load(xlnet_folder + f'roberta_fold{fold+1}.pth'))
    model.eval()
    models.append(model)

for data in test_loader:
    ids = data['ids'].cuda()
    masks = data['masks'].cuda()
    token_type_ids = data['token_type_ids'].cuda()
    tweet = data['tweet']
    offsets = data['offsets'].numpy()

    start_logits = []
    end_logits = []
    for model in models:
        with torch.no_grad():
            output = model(ids, masks,token_type_ids)
            start_logits.append(torch.softmax(output[0], dim=1).cpu().detach().numpy())
            end_logits.append(torch.softmax(output[1], dim=1).cpu().detach().numpy())

    start_logits = np.mean(start_logits, axis=0)
    end_logits = np.mean(end_logits, axis=0)
    for i in range(len(ids)):    
        start_pred = np.argmax(start_logits[i])
        end_pred = np.argmax(end_logits[i])
        if start_pred > end_pred:
            pred = tweet[i]
        else:
            pred = get_selected_text(tweet[i], start_pred, end_pred, offsets[i])
        predictions.append(pred)

CPU times: user 2min 37s, sys: 1min 3s, total: 3min 41s
Wall time: 3min 23s


# Submission

In [22]:
sub_df = pd.read_csv(data_folder + '/sample_submission.csv')
sub_df['selected_text'] = predictions
sub_df['selected_text'] = sub_df['selected_text'].apply(lambda x: x.replace('!!!!', '!') if len(x.split())==1 else x)
sub_df['selected_text'] = sub_df['selected_text'].apply(lambda x: x.replace('..', '.') if len(x.split())==1 else x)
sub_df['selected_text'] = sub_df['selected_text'].apply(lambda x: x.replace('...', '.') if len(x.split())==1 else x)
sub_df.to_csv('submission.csv', index=False)
sub_df.head()

Unnamed: 0,textID,selected_text
0,f87dea47db,last session of the day
1,96d74cb729,exciting
2,eee518ae67,such a shame!
3,01082688c6,happy
4,33987a8ee5,i like it!!


In [25]:
sub_df.shape

(3534, 2)

#### Verification
Look at a few selected text to see how they look

In [28]:
pd.merge(test_df,sub_df,how='left',on='textID')

Unnamed: 0,textID,text,sentiment,idx,selected_text
0,f87dea47db,Last session of the day http://twitpic.com/67ezh,neutral,0,last session of the day
1,96d74cb729,Shanghai is also really exciting (precisely -- skyscrapers galore). Good tweeps in China: (SH) (BJ).,positive,1,exciting
2,eee518ae67,"Recession hit Veronique Branquinho, she has to quit her company, such a shame!",negative,2,such a shame!
3,01082688c6,happy bday!,positive,3,happy
4,33987a8ee5,http://twitpic.com/4w75p - I like it!!,positive,4,i like it!!
...,...,...,...,...,...
3529,e5f0e6ef4b,"its at 3 am, im very tired but i can`t sleep but i try it",negative,3529,tired
3530,416863ce47,"All alone in this old house again. Thanks for the net which keeps me alive and kicking! Whoever invented the net, i wanna kiss your hair!",positive,3530,thanks
3531,6332da480c,I know what you mean. My little dog is sinking into depression... he wants to move someplace tropical,negative,3531,depression.
3532,df1baec676,_sutra what is your next youtube video gonna be about? I love your videos!,positive,3532,i love


Hmmm - they look good.