In [1]:
import torch
from torch.utils.data import Dataset, DataLoader
from torch.optim import AdamW

from torch import nn
import torch.nn.functional as F
import torchmetrics 
from torch.optim.lr_scheduler import _LRScheduler
from torch.nn.modules.loss import _Loss

## Arcface should be trained for the model which is not used for explicitly classification inference (without labeling)

In [2]:
import transformers
from transformers import AutoTokenizer, AutoModel

In [3]:
import os
os.environ["TOKENIZERS_PARALLELISM"] = "false"

import pandas as pd
import numpy as np
import random 
import math
from tqdm import tqdm


In [4]:
class GlobalConstantsConfigure():
    def __init__(self):
        
        self.is_debug = True # False
        self.run_name = ''# 'finetune_no_selfie_resnet50_0.08_cifar10_cosface'
        self.id_num = 22
        self.debug_num = 28
        self.continue_training = False
        self.num_epochs_done = 0
        self.last_model = '../input/selfie-transformer-pretraining-models/pretraining-new_selfie_add_label_smoothing_19.pt'
        self.pl_dir = '../input/imdb-dataset-csv-psuedo-labeling/pl_df.csv'
        self.original_testing_dir = '../input/imdb-dataset-sentiment-analysis-in-csv-format/Test.csv'

        self.seed = 127 # 107
        self.fold = 1
        self.num_folds = 4
        self.biggest_loss = np.Inf 

        
        self.training_dir = ''
        self.model_name = 'xlm-roberta-base' #'tf_efficientnet_b4'
        self.model_path = f'./{self.model_name}.pt'
        
        
        self.lr = 1e-6 # 1e-6
        self.lr_decay_rate = 0.8
        self.num_epochs = 1 # 70 
        self.batch_size = 64 # 32 # 16
        self.val_batch_size = 256
        self.label_smoothing = 0.0
        self.num_epochs_per_decay = 20
        self.patch_size = 14
        
        self.num_classes = 2
        self.device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
        self.num_workers = 2  # if torch.cuda.is_available() else 4
        
        self.max_length = 128
        self.scale = 30
        self.margin = 0.5
        self.fc_dim = 768
        self.in_features = 768
        
        self.inference = True
        self.psuedo_ratio = 0.4
        self.first_run = False
        self.length_test_df = 5000
        
gcc = GlobalConstantsConfigure()

In [5]:
class CosineAnnealingLRDecay(_LRScheduler):

    def __init__(self, optimizer, T_max, decay_coef=0.8, eta_min=1e-9, last_epoch=-1, verbose=False):
        if T_max <= 0 or not isinstance(T_max, int):
            raise ValueError("Expected positive integer T_0, but got {}".format(T_0))
        if eta_min < 0 :
            raise ValueError("Expected positive eta_min")
        self.T_max = T_max
        self.eta_min = eta_min
        self.T_cur = last_epoch
        self.decay_coef = decay_coef
        super().__init__(optimizer, last_epoch, verbose)

    def get_lr(self):
        if not self._get_lr_called_within_step:
            warnings.warn("To get the last learning rate computed by the scheduler, "
                          "please use `get_last_lr()`.", UserWarning)
        if self.last_epoch == 0:
              return [group['lr'] for group in self.optimizer.param_groups]
            
        T_cur = self.last_epoch# T_cur start from 1 
        lr_decay_coef = math.pow(self.decay_coef, T_cur / self.T_max)
        # print(T_cur, lr_decay_coef)
        return [self.eta_min + (base_lr - self.eta_min) * 1 / 2 * (1 + math.cos(math.pi * (T_cur % self.T_max) / self.T_max)) * lr_decay_coef
                for base_lr in self.base_lrs]

In [6]:
class SentenceDataset(Dataset):
    def __init__(self, df, text_column, label_column):
        texts = df[text_column]
        self.labels = df[label_column].values.astype('float')
        
        self.titles = []
        for title in texts:
            title = title.encode('utf-8').decode("unicode_escape")
            title = title.encode('ascii', 'ignore').decode("unicode_escape")
            title = title.lower()
            self.titles.append(title)

    def __len__(self):
        return len(self.titles)

    def __getitem__(self, idx):
        text = self.titles[idx]
        label = torch.tensor(self.labels[idx])
        return text, label
    
    
class SentenceInferenceDataset(Dataset):
    def __init__(self, df, text_column):
        texts = df[text_column]
        
        self.titles = []
        for title in texts:
            title = title.encode('utf-8').decode("unicode_escape")
            title = title.encode('ascii', 'ignore').decode("unicode_escape")
            title = title.lower()
            self.titles.append(title)

    def __len__(self):
        return len(self.titles)

    def __getitem__(self, idx):
        text = self.titles[idx]
        return text

In [7]:
training_df = pd.read_csv('../input/imdb-dataset-sentiment-analysis-in-csv-format/Train.csv').sample(300)
validation_df = pd.read_csv('../input/imdb-dataset-sentiment-analysis-in-csv-format/Valid.csv')

In [8]:
training_df

Unnamed: 0,text,label
18569,I don't know whether to recommend this movie t...,1
9835,"Loved Part One, The Impossible Planet, but who...",0
15111,"Saw this as previous viewer by accident, I hav...",1
15051,i love watching the Jericho mile. i mean watch...,1
10618,"I searched for this movie for years, apparentl...",1
...,...,...
12403,"Of course, seeing a few boom mikes doesn't mea...",1
27163,"I love bad movies. Not only, because they ofte...",1
21087,may contain spoilers!!!! so i watched this mov...,0
6853,Stories about the possibility of a post-apocal...,1


In [9]:
training_df.describe()

Unnamed: 0,label
count,300.0
mean,0.496667
std,0.500824
min,0.0
25%,0.0
50%,0.0
75%,1.0
max,1.0


In [10]:
training_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 300 entries, 18569 to 16166
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   text    300 non-null    object
 1   label   300 non-null    int64 
dtypes: int64(1), object(1)
memory usage: 7.0+ KB


In [11]:
if not gcc.first_run:
    pl_df = pd.read_csv(gcc.pl_dir)
    print('Number of available samples: ', len(pl_df[pl_df['is_available'] == 1]))
    pl_df_avail = pl_df[pl_df['is_available'] == 1].copy().reset_index(drop=True)
    pl_df_unavail = pl_df[pl_df['is_available'] == 0].copy().reset_index(drop=True)
    
    # print(len(pl_df_avail))
    # print(len(pl_df_unavail))

    pl_training_df = pl_df_avail[['text', 'preds']].copy()
    pl_training_df.rename(columns={'preds': 'label'}, inplace=True)
    
    training_df = pd.concat([training_df, pl_training_df]) 

Number of available samples:  2000


In [12]:
train_dataset = SentenceDataset(training_df, 'text', 'label')
training_dataloader = DataLoader(
    train_dataset,
    batch_size = gcc.batch_size,
    num_workers = gcc.num_workers,
    pin_memory = True,
    shuffle = True,
    drop_last = True
)


valid_dataset = SentenceDataset(validation_df, 'text', 'label')
validation_dataloader = DataLoader(
    valid_dataset,
    batch_size = gcc.batch_size,
    num_workers = gcc.num_workers,
    pin_memory = True,
    shuffle = False,
    drop_last = False
)

  
  if __name__ == "__main__":
  
  if __name__ == "__main__":
  
  if __name__ == "__main__":


In [13]:
def mean_pooling(model_output, attention_mask):
    token_embeddings = model_output[0]  # First element of model_output contains all token embeddings
    input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
    sum_embeddings = torch.sum(token_embeddings * input_mask_expanded, 1)
    sum_mask = torch.clamp(input_mask_expanded.sum(1), min=1e-9)
    return sum_embeddings / sum_mask

In [14]:
class SentimentAnalysisModel(nn.Module):
    def __init__(self, num_classes=gcc.num_classes, model_name=gcc.model_name, fc_dim=gcc.fc_dim, in_features=gcc.in_features, max_length=gcc.max_length):
        super().__init__()
        self.bert_tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.bert_model = AutoModel.from_pretrained(model_name).to(gcc.device)
        self.max_length = max_length
        self.last_seq_fc = nn.Sequential(
            nn.Dropout(0.2),
            nn.Linear(in_features, fc_dim),
            nn.BatchNorm1d(fc_dim)
        )
        
        
        self.classifier = nn.Sequential(
            nn.Linear(fc_dim, num_classes)
        )
        
    
    def get_params(self):
        g1 = [param for param in self.bert_model.parameters() if param.requires_grad]
        g2 = [param for param in self.last_seq_fc.parameters() if param.requires_grad] \
            + [param for param in self.classifier.parameters() if param.requires_grad] 

        return g1, g2
        
    
    def feature_extractor(self, texts):
        encoding = self.bert_tokenizer(texts, 
                                  padding=True, 
                                  truncation=True, 
                                  max_length = self.max_length,
                                  return_tensors='pt'
                                 ).to(gcc.device) # why need to send to device here
        input_ids = encoding['input_ids'] # convert word to id
        attention_mask = encoding['attention_mask'] # attention mask
        embedding = self.bert_model(input_ids, attention_mask = attention_mask)
        x = mean_pooling(embedding, attention_mask)
        x = self.last_seq_fc(x)
        return x
    
    def forward(self, texts):
        features = self.feature_extractor(texts)
        x = self.classifier(features)
        return x
    

In [15]:
def training_progress(model, training_dataloader, scheduler, loss_func):
    model.train()
    
    m_loss = 0.
    m_acc = 0.
    
    cnt = 0 
    
    training_loader = tqdm(training_dataloader, desc='Iterating through the training set')
    
    for texts, labels in training_loader:
        texts = list(texts)
        labels = labels.type(torch.LongTensor).to(gcc.device)
        preds = model(texts)
        loss = loss_func(preds, labels) 
        acc = metrics_acc(preds, labels)
        
        m_loss += loss.detach().item()
        m_acc += acc.detach().item()

        cnt += 1 
        
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        
    scheduler.step()
    
    
    mean_training_loss = m_loss / cnt # running_loss / len(training_dataloader.dataset)
    mean_training_acc = m_acc / cnt # running_corrects.double() / (len(training_dataloader.dataset) * gcc.num_targets)
    
    return mean_training_loss, mean_training_acc

In [16]:
def validation_progress(model, validation_dataloader, scheduler, loss_func):
    model.eval()
    cnt = 0 
    m_loss = 0.
    m_acc = 0.
    
    validation_loader = tqdm(validation_dataloader, desc='Iterating through the validation set')
    
    with torch.no_grad():
        for texts, labels in validation_loader:
            texts = list(texts)
            labels = labels.type(torch.LongTensor).to(gcc.device)

            preds = model(texts)
            loss = loss_func(preds, labels)
            acc = metrics_acc(preds, labels)

            m_loss += loss.detach().item()
            m_acc += acc.detach().item()

            cnt += 1 
        
    mean_validation_loss = m_loss / cnt 
    mean_validation_acc = m_acc / cnt 

    return mean_validation_loss, mean_validation_acc

In [17]:
def training_model(model, training_dataloader, validation_dataloader, scheduler, loss_func):
    training_losses_history, validation_losses_history = [], []
    training_acc_history, validation_acc_history = [], []
    best_loss = gcc.biggest_loss
    for epoch in range(gcc.num_epochs):
        
        print(f'Training epoch {epoch + 1}/{gcc.num_epochs}, learning rate: {scheduler.get_last_lr()[0]}')
        
        training_loss, training_acc = training_progress(model, training_dataloader, scheduler, loss_func)
        training_losses_history.append(training_loss)
        training_acc_history.append(training_acc)

        validation_loss, validation_acc = validation_progress(model, validation_dataloader, scheduler, loss_func)
        validation_losses_history.append(validation_loss)
        validation_acc_history.append(validation_acc)
        
        

        if validation_loss <= best_loss: # sussy baka
            best_loss = validation_loss
            print('Saved new best selfie model')

            torch.save({ # save the selfie model and pmodel seperately
                'model_state_dict': model.state_dict(),
                'optimizer_state_dict': optimizer.state_dict(),
                'scheduler_state_dict': scheduler.state_dict()
            }, gcc.model_name + '_bert_best.pt')
            
        if epoch == gcc.num_epochs - 1: # i believe my timing capability
            print('Saved last bert model')
            torch.save({
                'model_state_dict': model.state_dict(),
                'optimizer_state_dict': optimizer.state_dict(),
                'scheduler_state_dict': scheduler.state_dict()
            }, gcc.model_name + '_' + str(gcc.num_epochs_done + gcc.num_epochs) + '_bert_last.pt')
        

        print(f'Results: Epoch {epoch + 1}/{gcc.num_epochs} | Training_loss : {training_loss:.5f} | Training_acc : {training_acc:.3f} | Validation_loss : {validation_loss:.5f} | Validation_acc : {validation_acc:.3f}\n')
    return training_losses_history, training_acc_history, validation_losses_history, validation_acc_history

In [18]:
metrics_acc = torchmetrics.Accuracy(threshold=0.0, num_classes = gcc.num_classes).to(gcc.device)
loss_func = torch.nn.CrossEntropyLoss()
model = SentimentAnalysisModel()
model.to(gcc.device)


g1, g2 = model.get_params()
optimizer = AdamW([
                {'params': g1, 'lr': gcc.lr, 'weight_decay' : 0.},
                {'params': g2, 'lr': gcc.lr * 2, 'weight_decay' : 0.}
            ])
scheduler = CosineAnnealingLRDecay(optimizer, decay_coef=0.9,  T_max=20)

Downloading:   0%|          | 0.00/615 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/4.83M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/8.68M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.04G [00:00<?, ?B/s]

Some weights of the model checkpoint at xlm-roberta-base were not used when initializing XLMRobertaModel: ['lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.bias', 'lm_head.layer_norm.weight', 'lm_head.decoder.weight', 'lm_head.dense.bias']
- This IS expected if you are initializing XLMRobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLMRobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [19]:
training_losses_history, training_acc_history, validation_losses_history, validation_acc_history = training_model(model, training_dataloader, validation_dataloader, scheduler, loss_func)

Training epoch 1/1, learning rate: 1e-06


Iterating through the training set: 100%|██████████| 35/35 [00:29<00:00,  1.21it/s]
Iterating through the validation set: 100%|██████████| 79/79 [00:23<00:00,  3.37it/s]


Saved new best selfie model
Saved last bert model
Results: Epoch 1/1 | Training_loss : 0.73070 | Training_acc : 0.582 | Validation_loss : 0.69764 | Validation_acc : 0.493



In [20]:
def inference(model, testing_dataloader):
    model.eval()
        
    labels_cmp = np.array([])
    preds_cmp = np.array([])
    probs_cmp = np.array([])

    dirs_img = []
    cnt = 0 

    testing_loader = tqdm(testing_dataloader, desc=f'Iterating through the testing set')

    with torch.no_grad():
        for texts in testing_loader:
            texts = list(texts)

            preds = model(texts)

            preds_cmp = np.append(preds_cmp, preds.argmax(1).cpu().numpy())
            probs = torch.max(preds.cpu(), dim=1)[0]
            probs_cmp = np.append(probs_cmp, probs.numpy())
            
            labels_cmp = np.append(labels_cmp, preds.cpu().numpy())


            cnt += 1 
        
    return preds_cmp, probs_cmp

In [21]:
if gcc.inference:
    
    
    if gcc.first_run: 
        testing_df = pd.read_csv(gcc.original_testing_dir)
        testing_df['is_available'] = 0
    else: 
        testing_df = pl_df_unavail
        print(testing_df)
         # point to the old data
        
    
    test_dataset = SentenceInferenceDataset(testing_df, 'text')
    testing_dataloader = DataLoader(
        test_dataset,
        batch_size = gcc.batch_size,
        num_workers = gcc.num_workers,
        pin_memory = True,
        shuffle = False,
        drop_last = False
    )
    
    checkpoint = torch.load(gcc.model_name + '_bert_best.pt')
    model.load_state_dict(checkpoint['model_state_dict'])
    preds_cmp, probs_cmp = inference(model, testing_dataloader)
    testing_df['preds'] = preds_cmp    
    testing_df['probs'] = probs_cmp    
     


                                                   text  label  is_available  \
0     After a slow beginning, BRUCE ALMIGHTY is a ve...      1             0   
1     I'm not a sports fan - but I love sports flics...      1             0   
2     The John Goodman program was pretty awful, but...      0             0   
3     Oddly, I have very little to say about "The Bo...      0             0   
4     As a late-going patron of the drive-in thearer...      1             0   
...                                                 ...    ...           ...   
2995  Why would anyone make a film like this? Why wo...      0             0   
2996  This movie is by far one of the worst B-movies...      0             0   
2997  <br /><br />Upon concluding my viewing of "Tra...      0             0   
2998  The Mod Squad isn't a movie, it's a void. That...      0             0   
2999  As a lesbian, I am always on the lookout for f...      0             0   

      preds     probs  
0       1.0 -0.

Iterating through the testing set: 100%|██████████| 47/47 [00:13<00:00,  3.40it/s]


In [22]:
testing_df.sort_values(by=['probs'], ascending=False, inplace=True)
testing_df = testing_df.reset_index(drop=True)

In [23]:
testing_df['is_available'][: int(gcc.psuedo_ratio * gcc.length_test_df)] = 1
testing_df

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


Unnamed: 0,text,label,is_available,preds,probs
0,<br /><br />It sucked.<br /><br />I returned t...,0,1,0.0,0.395209
1,I read ashew's comment and thought they must h...,1,1,0.0,0.378520
2,I didn't see such a movie where the creators p...,1,1,0.0,0.369896
3,This is possibly the single worst film i have ...,0,1,0.0,0.366993
4,This film is terrible. I was really looking fo...,0,1,0.0,0.356997
...,...,...,...,...,...
2995,This movie was dreadful. Biblically very inacc...,0,0,0.0,0.138555
2996,In the rapid economic development of 1990's in...,1,0,0.0,0.132932
2997,MacArthur is a great movie with a great story ...,1,0,0.0,0.132007
2998,If you want an excellent survey of Byzantine h...,1,0,0.0,0.118138


In [24]:
len(testing_df[testing_df['is_available'] == 1])

2000

In [25]:
len(testing_df)

3000

In [26]:
len(pl_df[pl_df['is_available'] == 1])

2000

In [27]:
len(pl_df[pl_df['is_available'] == 0])

3000

In [28]:
if not gcc.first_run: 
    new_pl_df = pd.concat([pl_df_avail, testing_df])
    print('Number of available samples: ', len(new_pl_df[new_pl_df['is_available'] == 1]))
    print('New pl df size: ', len(new_pl_df))

    new_pl_df.to_csv('pl_df.csv')
else:
    print(testing_df)
    testing_df.to_csv('pl_df.csv', index=False)
    testing_df

Number of available samples:  4000
New pl df size:  5000
