<a href="https://colab.research.google.com/github/sangeetsaurabh/tweet_phrase_kaggle_competition/blob/master/pytorch_transformer/pytorch_model_with_Electra.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

#### Import all the Libraries

In [1]:
!pip install tokenizers

Collecting tokenizers
[?25l  Downloading https://files.pythonhosted.org/packages/6b/15/1c026f3aeafd26db30cb633d9915aae666a415179afa5943263e5dbd55a6/tokenizers-0.8.0-cp36-cp36m-manylinux1_x86_64.whl (3.0MB)
[K     |████████████████████████████████| 3.0MB 2.8MB/s 
[?25hInstalling collected packages: tokenizers
Successfully installed tokenizers-0.8.0


In [2]:
!pip install transformers

Collecting transformers
[?25l  Downloading https://files.pythonhosted.org/packages/82/25/89050e69ed53c2a3b7f8c67844b3c8339c1192612ba89a172cf85b298948/transformers-3.0.1-py3-none-any.whl (757kB)
[K     |████████████████████████████████| 757kB 2.9MB/s 
[?25hCollecting tokenizers==0.8.0-rc4
[?25l  Downloading https://files.pythonhosted.org/packages/e8/bd/e5abec46af977c8a1375c1dca7cb1e5b3ec392ef279067af7f6bc50491a0/tokenizers-0.8.0rc4-cp36-cp36m-manylinux1_x86_64.whl (3.0MB)
[K     |████████████████████████████████| 3.0MB 15.8MB/s 
Collecting sacremoses
[?25l  Downloading https://files.pythonhosted.org/packages/7d/34/09d19aff26edcc8eb2a01bed8e98f13a1537005d31e95233fd48216eed10/sacremoses-0.0.43.tar.gz (883kB)
[K     |████████████████████████████████| 890kB 31.6MB/s 
[?25hCollecting sentencepiece!=0.1.92
[?25l  Downloading https://files.pythonhosted.org/packages/d4/a4/d0a884c4300004a78cca907a6ff9a5e9fe4f090f5d95ab341c53d28cbc58/sentencepiece-0.1.91-cp36-cp36m-manylinux1_x86_64.whl 

In [3]:
import numpy as np
import pandas as pd
import os
import warnings
import random
import torch 
from torch import nn
import torch.optim as optim
from sklearn.model_selection import StratifiedKFold
import tokenizers
from transformers import ElectraConfig, ElectraModel, ElectraTokenizer

warnings.filterwarnings('ignore')

#### Seed

In [4]:
from google.colab import drive
drive.mount('/content/drive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/drive


In [5]:
data_folder = "/content/drive/My Drive/tweet-sentiment-extraction/data"
electra_folder = "/content/drive/My Drive/tweet-sentiment-extraction/electra/"
tmp_folder = '/tmp'

#### Data Loader

In [6]:
!wget https://s3.amazonaws.com/models.huggingface.co/bert/google/electra-base-generator/vocab.txt

--2020-07-05 01:16:02--  https://s3.amazonaws.com/models.huggingface.co/bert/google/electra-base-generator/vocab.txt
Resolving s3.amazonaws.com (s3.amazonaws.com)... 52.217.38.182
Connecting to s3.amazonaws.com (s3.amazonaws.com)|52.217.38.182|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 231508 (226K) [text/plain]
Saving to: ‘vocab.txt’


2020-07-05 01:16:02 (5.30 MB/s) - ‘vocab.txt’ saved [231508/231508]



In [7]:
class TweetDataset(torch.utils.data.Dataset):
    def __init__(self, df, max_len=128):
        self.df = df
        self.max_len = max_len
        self.labeled = 'selected_text' in df
        self.tokenizer = tokenizers.BertWordPieceTokenizer(
            vocab_file='vocab.txt', 
            lowercase=True)
        self.tokenizer_1 = ElectraTokenizer("vocab.txt",do_lower_case=True)

    def __getitem__(self, index):
        data = {}
        row = self.df.iloc[index]
        
        ids, masks, token_type_ids, tweet, offsets,row_id = self.get_input_data(row)

        data['ids'] = ids
        data['masks'] = masks
        data['token_type_ids'] = token_type_ids
        data['tweet'] = tweet
        data['offsets'] = offsets
        data['row_id'] = row_id
        
        if self.labeled:
            #print("I came here")
            start_idx, end_idx = self.get_target_idx(row, tweet, offsets)
            data['start_idx'] = start_idx
            data['end_idx'] = end_idx
        
        return data

    def __len__(self):
        return len(self.df)
    
    def get_input_data(self, row):
        tweet = " " + " ".join(row.text.lower().split())

        encoding = self.tokenizer.encode(row.sentiment,tweet)
        ids = encoding.ids
        offsets = encoding.offsets
        offsets[1] = (0,0)
        token_type_ids = self.tokenizer_1.encode_plus(row.sentiment,tweet,max_length=self.max_len,pad_to_max_length=True,truncation="longest_first")['token_type_ids']
            
        pad_len = self.max_len - len(ids)
        if pad_len > 0:
            ids += [0] * pad_len
            offsets += [(0, 0)] * pad_len
        
        ids = torch.tensor(ids)
        masks = torch.where(ids != 0, torch.tensor(1), torch.tensor(0))
        offsets = torch.tensor(offsets)
        token_type_ids = torch.tensor(token_type_ids)
        
        return ids, masks, token_type_ids, tweet, offsets, row.idx
        
    def get_target_idx(self, row, tweet, offsets):
        selected_text = " " +  " ".join(row.selected_text.lower().split())
        #print(selected_text)
        #print(row.idx)

        len_st = len(selected_text) - 1
        idx0 = None
        idx1 = None

        for ind in (i for i, e in enumerate(tweet) if e == selected_text[1]):
            if " " + tweet[ind: ind+len_st] == selected_text:
                idx0 = ind
                idx1 = ind + len_st - 1
                break

        char_targets = [0] * len(tweet)
        if idx0 != None and idx1 != None:
            for ct in range(idx0, idx1 + 1):
                char_targets[ct] = 1

        target_idx = []
        for j, (offset1, offset2) in enumerate(offsets):
            if sum(char_targets[offset1: offset2]) > 0:
                target_idx.append(j)

        start_idx = target_idx[0]
        end_idx = target_idx[-1]
        
        return start_idx, end_idx
        
def get_train_val_loaders(df, train_idx, val_idx, batch_size=8):
    train_df = df.iloc[train_idx]
    val_df = df.iloc[val_idx]

    train_loader = torch.utils.data.DataLoader(
        TweetDataset(train_df), 
        batch_size=batch_size, 
        shuffle=True, 
        num_workers=2,
        drop_last=True)

    val_loader = torch.utils.data.DataLoader(
        TweetDataset(val_df), 
        batch_size=batch_size, 
        shuffle=False, 
        num_workers=2)

    dataloaders_dict = {"train": train_loader, "val": val_loader}

    return dataloaders_dict

def get_test_loader(df, batch_size=32):
    loader = torch.utils.data.DataLoader(
        TweetDataset(df), 
        batch_size=batch_size, 
        shuffle=False, 
        num_workers=2)    
    return loader

#### Model

In [8]:
!wget https://s3.amazonaws.com/models.huggingface.co/bert/google/electra-base-generator/config.json

--2020-07-05 01:16:03--  https://s3.amazonaws.com/models.huggingface.co/bert/google/electra-base-generator/config.json
Resolving s3.amazonaws.com (s3.amazonaws.com)... 52.216.144.29
Connecting to s3.amazonaws.com (s3.amazonaws.com)|52.216.144.29|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 463 [application/json]
Saving to: ‘config.json’


2020-07-05 01:16:03 (17.7 MB/s) - ‘config.json’ saved [463/463]



In [9]:
config = ElectraConfig.from_pretrained(
            'config.json', output_hidden_states=True)    
roberta = ElectraModel.from_pretrained(
            'google/electra-base-generator', config=config)

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=135011821.0, style=ProgressStyle(descri…




In [10]:
config.to_json_file("/tmp/electra_config.json")

In [11]:
roberta.save_pretrained(tmp_folder)

In [12]:
class TweetModel(nn.Module):
    def __init__(self):
        super(TweetModel, self).__init__()
        
        config = ElectraConfig.from_pretrained(
            '/tmp/electra_config.json', output_hidden_states=True)    
        self.roberta = ElectraModel.from_pretrained(
            '/tmp/pytorch_model.bin', config=config)
        self.dropout = nn.Dropout(0.5)
        self.fc = nn.Linear(config.hidden_size, 2)
        nn.init.normal_(self.fc.weight, std=0.02)
        nn.init.normal_(self.fc.bias, 0)

    def forward(self, input_ids, attention_mask,token_type_ids):
        _, hs = self.roberta(input_ids, attention_mask,token_type_ids=token_type_ids)
         
        x = torch.stack([hs[-1], hs[-2], hs[-3], hs[-4]])
        x = torch.mean(x, 0)
        x = self.dropout(x)
        x = self.fc(x)
        start_logits, end_logits = x.split(1, dim=-1)
        start_logits = start_logits.squeeze(-1)
        end_logits = end_logits.squeeze(-1)
                
        return start_logits, end_logits

##### Loss function

In [13]:
def loss_fn(start_logits, end_logits, start_positions, end_positions):
    ce_loss = nn.CrossEntropyLoss()
    start_loss = ce_loss(start_logits, start_positions)
    end_loss = ce_loss(end_logits, end_positions)    
    total_loss = start_loss + end_loss
    return total_loss

Evaluation Function

In [14]:
def get_selected_text(text, start_idx, end_idx, offsets):
    selected_text = ""
    for ix in range(start_idx, end_idx + 1):
        selected_text += text[offsets[ix][0]: offsets[ix][1]]
        if (ix + 1) < len(offsets) and offsets[ix][1] < offsets[ix + 1][0]:
            selected_text += " "
    return selected_text

def jaccard(str1, str2): 
    a = set(str1.lower().split()) 
    b = set(str2.lower().split())
    c = a.intersection(b)
    return float(len(c)) / (len(a) + len(b) - len(c))

def compute_jaccard_score(text, start_idx, end_idx, start_logits, end_logits, offsets):
    start_pred = np.argmax(start_logits)
    end_pred = np.argmax(end_logits)
    if start_pred > end_pred:
        pred = text
    else:
        pred = get_selected_text(text, start_pred, end_pred, offsets)
        
    true = get_selected_text(text, start_idx, end_idx, offsets)
    
    return jaccard(true, pred)

#### Training Function

In [15]:
def train_model(model, dataloaders_dict, criterion, optimizer, num_epochs, filename, fold_performance):
    model.cuda()
    
    epoch_score = 0

    for epoch in range(num_epochs):
        for phase in ['train', 'val']:
            if phase == 'train':
                model.train()
            else:
                model.eval()

            epoch_loss = 0.0
            epoch_jaccard = 0.0
            
            for data in (dataloaders_dict[phase]):
                ids = data['ids'].cuda()
                masks = data['masks'].cuda()
                token_type_ids = data['token_type_ids'].cuda()
                tweet = data['tweet']
                offsets = data['offsets'].numpy()
                start_idx = data['start_idx'].cuda()
                end_idx = data['end_idx'].cuda()

                optimizer.zero_grad()

                with torch.set_grad_enabled(phase == 'train'):

                    start_logits, end_logits = model(ids, masks,token_type_ids)

                    loss = criterion(start_logits, end_logits, start_idx, end_idx)
                    
                    if phase == 'train':
                        loss.backward()
                        optimizer.step()

                    epoch_loss += loss.item() * len(ids)
                    
                    start_idx = start_idx.cpu().detach().numpy()
                    end_idx = end_idx.cpu().detach().numpy()
                    start_logits = torch.softmax(start_logits, dim=1).cpu().detach().numpy()
                    end_logits = torch.softmax(end_logits, dim=1).cpu().detach().numpy()
                    
                    for i in range(len(ids)):                        
                        jaccard_score = compute_jaccard_score(
                            tweet[i],
                            start_idx[i],
                            end_idx[i],
                            start_logits[i], 
                            end_logits[i], 
                            offsets[i])
                        epoch_jaccard += jaccard_score
                    
            epoch_loss = epoch_loss / len(dataloaders_dict[phase].dataset)
            epoch_jaccard = epoch_jaccard / len(dataloaders_dict[phase].dataset)
            
            if phase == 'val':
                epoch_score += epoch_jaccard
            
            print('Epoch {}/{} | {:^5} | Loss: {:.4f} | Jaccard: {:.4f}'.format(
                epoch + 1, num_epochs, phase, epoch_loss, epoch_jaccard))
    
    torch.save(model.state_dict(), filename)
    return epoch_score/num_epochs

#### Training

In [16]:
num_epochs = 5
batch_size = 32
skf = StratifiedKFold(n_splits=10, shuffle=True)

In [17]:
train_df = pd.read_csv(data_folder + '/train.csv')
train_df = train_df[train_df.text == train_df.text]
train_df['text'] = train_df['text'].astype(str)
train_df['selected_text'] = train_df['selected_text'].astype(str)
train_df['idx'] = train_df.index.copy()
fold_performance = {}


for fold, (train_idx, val_idx) in enumerate(skf.split(train_df, train_df.sentiment), start=1): 
    print(f'Fold: {fold}')

    model = TweetModel()
    optimizer = optim.AdamW(model.parameters(), lr=2e-5, betas=(0.9, 0.999))
    criterion = loss_fn    
    dataloaders_dict = get_train_val_loaders(train_df, train_idx, val_idx, batch_size)
    #print(dataloaders_dict)


    epoch_score = train_model(
        model, 
        dataloaders_dict,
        criterion, 
        optimizer, 
        num_epochs,
        electra_folder + f'roberta_fold{fold}.pth',
        fold_performance)
    fold_performance[fold] = epoch_score

Fold: 1
Epoch 1/5 | train | Loss: 2.9026 | Jaccard: 0.6097
Epoch 1/5 |  val  | Loss: 1.8928 | Jaccard: 0.6805
Epoch 2/5 | train | Loss: 1.9302 | Jaccard: 0.6816
Epoch 2/5 |  val  | Loss: 1.7803 | Jaccard: 0.7000
Epoch 3/5 | train | Loss: 1.7793 | Jaccard: 0.6992
Epoch 3/5 |  val  | Loss: 1.7445 | Jaccard: 0.7053
Epoch 4/5 | train | Loss: 1.6717 | Jaccard: 0.7112
Epoch 4/5 |  val  | Loss: 1.7371 | Jaccard: 0.7034
Epoch 5/5 | train | Loss: 1.5853 | Jaccard: 0.7222
Epoch 5/5 |  val  | Loss: 1.7597 | Jaccard: 0.7112
Fold: 2
Epoch 1/5 | train | Loss: 2.8891 | Jaccard: 0.6055
Epoch 1/5 |  val  | Loss: 1.9199 | Jaccard: 0.6789
Epoch 2/5 | train | Loss: 1.9284 | Jaccard: 0.6818
Epoch 2/5 |  val  | Loss: 1.8025 | Jaccard: 0.6966
Epoch 3/5 | train | Loss: 1.7663 | Jaccard: 0.6992
Epoch 3/5 |  val  | Loss: 1.7687 | Jaccard: 0.6943
Epoch 4/5 | train | Loss: 1.6652 | Jaccard: 0.7161
Epoch 4/5 |  val  | Loss: 1.7553 | Jaccard: 0.6984
Epoch 5/5 | train | Loss: 1.5698 | Jaccard: 0.7241
Epoch 5/5 |  va

In [18]:
best_model = max(fold_performance, key=fold_performance.get)
print(best_model)

5


In [19]:
fold_performance

{1: 0.7000701779830472,
 2: 0.6934972208492283,
 3: 0.7010250886662777,
 4: 0.7031523762053492,
 5: 0.7049223708135355,
 6: 0.7041371048666087,
 7: 0.6996978620192177,
 8: 0.6992276094306504,
 9: 0.692510159375797,
 10: 0.6951331763146122}

#### Run the model against Test dataset

In [23]:
%%time

#test_df = pd.read_csv('../input/tweet-sentiment-extraction/test.csv')
test_df = pd.read_csv(data_folder + '/test.csv')
test_df['text'] = test_df['text'].astype(str)
test_df['idx'] = test_df.index.copy()
test_loader = get_test_loader(test_df)
predictions = []
models = []
for fold in range(skf.n_splits):
    model = TweetModel()
    model.cuda()
    model.load_state_dict(torch.load(electra_folder + f'roberta_fold{fold+1}.pth'))
    model.eval()
    models.append(model)

### Use the best model to predict the result
#model = TweetModel()
#model.cuda()
#model.load_state_dict(torch.load(f'roberta_fold{best_model}.pth'))
#model.eval()
#models.append(model)




for data in test_loader:
    ids = data['ids'].cuda()
    masks = data['masks'].cuda()
    token_type_ids = data['token_type_ids'].cuda()
    tweet = data['tweet']
    offsets = data['offsets'].numpy()
    

    start_logits = []
    end_logits = []
    for model in models:
        with torch.no_grad():
            output = model(ids, masks,token_type_ids)
            start_logits.append(torch.softmax(output[0], dim=1).cpu().detach().numpy())
            end_logits.append(torch.softmax(output[1], dim=1).cpu().detach().numpy())

    start_logits = np.mean(start_logits, axis=0)
    end_logits = np.mean(end_logits, axis=0)
    for i in range(len(ids)):    
        start_pred = np.argmax(start_logits[i])
        end_pred = np.argmax(end_logits[i])
        if start_pred > end_pred:
            pred = tweet[i]
        else:
            pred = get_selected_text(tweet[i], start_pred, end_pred, offsets[i])
        predictions.append(pred)

CPU times: user 33 s, sys: 5.02 s, total: 38 s
Wall time: 38.7 s


In [25]:
sub_df = pd.read_csv(data_folder + '/sample_submission.csv')
sub_df['selected_text'] = predictions
sub_df['selected_text'] = sub_df['selected_text'].apply(lambda x: x.replace('!!!!', '!') if len(x.split())==1 else x)
sub_df['selected_text'] = sub_df['selected_text'].apply(lambda x: x.replace('..', '.') if len(x.split())==1 else x)
sub_df['selected_text'] = sub_df['selected_text'].apply(lambda x: x.replace('...', '.') if len(x.split())==1 else x)
sub_df.to_csv('submission.csv', index=False)
sub_df.head()

Unnamed: 0,textID,selected_text
0,f87dea47db,last session of the day
1,96d74cb729,shanghai is also really exciting (precisely -...
2,eee518ae67,such a shame!
3,01082688c6,happy bday!
4,33987a8ee5,i like it!!
