In [None]:
%%bash
# Quiet (-q) install of the transformers package; no version is pinned here.
pip install -q transformers

In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from pathlib import Path 

import os

import torch
import torch.optim as optim

import random 

# fastai
from fastai import *
from fastai.text import *
from fastai.callbacks import *

# transformers
from transformers import PreTrainedModel, PreTrainedTokenizer, PretrainedConfig

from transformers import BertForSequenceClassification, BertTokenizer, BertConfig
from transformers import RobertaForSequenceClassification, RobertaTokenizer, RobertaConfig
from transformers import XLNetForSequenceClassification, XLNetTokenizer, XLNetConfig
from transformers import XLMForSequenceClassification, XLMTokenizer, XLMConfig
from transformers import DistilBertForSequenceClassification, DistilBertTokenizer, DistilBertConfig
from transformers import AlbertConfig, AlbertModel, AlbertTokenizer

In [None]:
import fastai
import transformers
# Record the installed library versions next to the results
# (the install cell above does not pin a transformers version).
print('fastai version :', fastai.__version__)
print('transformers version :', transformers.__version__)

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Load the IMDB reviews CSV from the Kaggle input directory.
IMDB_dataset = pd.read_csv("/kaggle/input/imdb-dataset-of-50k-movie-reviews/IMDB Dataset.csv")

# 50/50 hold-out split. `random_state` is fixed so that Restart & Run All
# reproduces the same partition; the notebook-level `seed` parameter is only
# defined in a later cell, so its value (42) is repeated here.
# NOTE(review): in the cells visible here the databunch is built from
# `IMDB_dataset` directly, not from `train`/`test` — confirm these are still needed.
train, test = train_test_split(IMDB_dataset, test_size=0.5, random_state=42)

print(train.shape,test.shape)

In [None]:
# Maps a `model_type` key to its (model class, tokenizer class, config class) triple.
# Every entry uses a `...ForSequenceClassification` model except 'albert',
# which maps to the bare `AlbertModel`.
MODEL_CLASSES = {
    'bert': (BertForSequenceClassification, BertTokenizer, BertConfig),
    'xlnet': (XLNetForSequenceClassification, XLNetTokenizer, XLNetConfig),
    'xlm': (XLMForSequenceClassification, XLMTokenizer, XLMConfig),
    'roberta': (RobertaForSequenceClassification, RobertaTokenizer, RobertaConfig),
    'distilbert': (DistilBertForSequenceClassification, DistilBertTokenizer, DistilBertConfig),
    'albert': (AlbertModel, AlbertTokenizer, AlbertConfig)
}

In [None]:
# Parameters
# `seed` is passed to seed_all() and to the databunch split.
seed = 42
# `use_fp16` is only referenced by a commented-out line in the learner cell.
use_fp16 = False
# Batch size: passed to .databunch() and read by CustomTransformerModel.__init__.
bs = 10


# Active selection: `model_type` is the key into MODEL_CLASSES and
# `pretrained_model_name` is the checkpoint name given to from_pretrained().
model_type = 'albert'
pretrained_model_name = 'albert-base-v2'

# Alternative configurations (uncomment one pair to switch model):
# model_type = 'roberta'
# pretrained_model_name = 'roberta-large'

# model_type = 'bert'
# pretrained_model_name='bert-base-uncased'

# model_type = 'distilbert'
# pretrained_model_name = 'distilbert-base-uncased'

#model_type = 'xlm'
#pretrained_model_name = 'xlm-clm-enfr-1024'

# model_type = 'xlnet'
# pretrained_model_name = 'xlnet-base-cased'

In [None]:
# Pick the (model, tokenizer, config) classes registered for the selected `model_type`.
model_class, tokenizer_class, config_class = MODEL_CLASSES[model_type]

In [None]:
def seed_all(seed_value):
    """Seed every random number generator the notebook relies on.

    Seeds Python's `random`, NumPy and PyTorch on the CPU. When CUDA is
    available it also seeds the CUDA generators and puts cuDNN into
    deterministic mode with benchmarking switched off.
    """
    for seeder in (random.seed, np.random.seed, torch.manual_seed):
        seeder(seed_value)

    # Nothing GPU-related to configure on a CPU-only machine.
    if not torch.cuda.is_available():
        return

    torch.cuda.manual_seed(seed_value)
    torch.cuda.manual_seed_all(seed_value)
    cudnn = torch.backends.cudnn
    cudnn.deterministic = True
    cudnn.benchmark = False

In [None]:
# Seed all RNGs before the tokenizer, data and model objects are created below.
seed_all(seed)

In [None]:
class TransformersBaseTokenizer(BaseTokenizer):
    """Wrapper around PreTrainedTokenizer to be compatible with fast.ai"""
    def __init__(self, pretrained_tokenizer: PreTrainedTokenizer, model_type = 'bert', **kwargs):
        """Store the wrapped tokenizer, its maximum sequence length and the model type.

        `model_type` selects how the special tokens are placed in `tokenizer()`.
        Extra keyword arguments are accepted and ignored.
        """
        self._pretrained_tokenizer = pretrained_tokenizer
        # Newer transformers releases expose the limit as `model_max_length`
        # (the old `max_len` attribute was deprecated and then removed);
        # older releases only have `max_len`. Support both.
        max_seq_len = getattr(pretrained_tokenizer, 'model_max_length', None)
        if max_seq_len is None:
            max_seq_len = pretrained_tokenizer.max_len
        self.max_seq_len = max_seq_len
        self.model_type = model_type

    def __call__(self, *args, **kwargs): 
        # fastai's Tokenizer calls `tok_func(...)` to obtain a tokenizer
        # instance; returning self lets an already-built instance be passed in.
        return self

    def tokenizer(self, t:str) -> List[str]:
        """Limits the maximum sequence length and adds the special tokens"""
        CLS = self._pretrained_tokenizer.cls_token
        SEP = self._pretrained_tokenizer.sep_token
        # Two positions are reserved for the CLS and SEP tokens.
        budget = self.max_seq_len - 2
        if self.model_type in ['roberta']:
            tokens = self._pretrained_tokenizer.tokenize(t, add_prefix_space=True)[:budget]
            return [CLS] + tokens + [SEP]
        tokens = self._pretrained_tokenizer.tokenize(t)[:budget]
        if self.model_type in ['xlnet']:
            # XLNet places its special tokens at the end of the sequence.
            return tokens + [SEP] + [CLS]
        return [CLS] + tokens + [SEP]

In [None]:
# transformer_tokenizer = tokenizer_class.from_pretrained("/kaggle/input/large-roberta")
# Load the pretrained tokenizer, wrap it for fastai, and build a fastai
# Tokenizer with both rule lists (pre_rules / post_rules) passed empty.
transformer_tokenizer = tokenizer_class.from_pretrained(pretrained_model_name)
transformer_base_tokenizer = TransformersBaseTokenizer(pretrained_tokenizer = transformer_tokenizer, model_type = model_type)
fastai_tokenizer = Tokenizer(tok_func = transformer_base_tokenizer, pre_rules=[], post_rules=[])

In [None]:
class TransformersVocab(Vocab):
    "fastai `Vocab` whose token/id conversions are delegated to a transformers tokenizer."

    def __init__(self, tokenizer: PreTrainedTokenizer):
        # The fastai vocabulary list stays empty; the tokenizer owns the mapping.
        super().__init__(itos=[])
        self.tokenizer = tokenizer

    def numericalize(self, t:Collection[str]) -> List[int]:
        "Map the tokens in `t` to their ids via the wrapped tokenizer."
        token_ids = self.tokenizer.convert_tokens_to_ids(t)
        return token_ids

    def textify(self, nums:Collection[int], sep=' ') -> List[str]:
        "Map the ids in `nums` back to tokens; joined with `sep` unless `sep` is None."
        id_list = np.array(nums).tolist()
        tokens = self.tokenizer.convert_ids_to_tokens(id_list)
        if sep is None:
            return tokens
        return sep.join(tokens)

    def __getstate__(self):
        # State used for pickling: the (empty) itos list plus the tokenizer.
        return {'itos':self.itos, 'tokenizer':self.tokenizer}

    def __setstate__(self, state:dict):
        # Restore both attributes, then rebuild the reverse lookup from itos.
        self.itos = state['itos']
        self.tokenizer = state['tokenizer']
        reverse = {tok: idx for idx, tok in enumerate(self.itos)}
        self.stoi = collections.defaultdict(int, reverse)

In [None]:
# Numericalization goes through the transformers tokenizer's own vocabulary.
transformer_vocab =  TransformersVocab(tokenizer = transformer_tokenizer)
numericalize_processor = NumericalizeProcessor(vocab=transformer_vocab)

# fastai's BOS/EOS markers are disabled; the wrapped tokenizer adds CLS/SEP itself.
tokenize_processor = TokenizeProcessor(tokenizer=fastai_tokenizer, include_bos=False, include_eos=False)

# Processors are listed in the order tokenize -> numericalize.
transformer_processor = [tokenize_processor, numericalize_processor]

In [None]:
# Only 'xlnet' is padded at the start of the sequence.
pad_first = model_type == 'xlnet'
pad_idx = transformer_tokenizer.pad_token_id

# Sanity check: tokens -> ids -> tokens round trip on a sample sentence.
tokens = transformer_tokenizer.tokenize('Salut c est moi, Hello it s me')
print(tokens)

ids = transformer_tokenizer.convert_tokens_to_ids(tokens)
print(ids)

transformer_tokenizer.convert_ids_to_tokens(ids)


In [None]:
# Build the fastai DataBunch from the full dataframe:
#   - text is read from the 'review' column, labels from 'sentiment'
#   - a random 50% of the rows is held out for validation (seeded with `seed`)
#   - batches use the tokenizer's pad token id and the `pad_first` setting
databunch = (TextList.from_df(IMDB_dataset, cols='review', processor=transformer_processor)
             .split_by_rand_pct(0.5,seed=seed)
             .label_from_df(cols= 'sentiment')
             .databunch(bs=bs, pad_first=pad_first, pad_idx=pad_idx))

In [None]:
# `import torch.tensor as Tensor` only works on PyTorch releases that still
# ship a `torch/tensor.py` module; importing the class directly is portable.
from torch import Tensor

def dropout_mask(x:Tensor, sz, p:float):
    """Return a dropout mask of the same type as `x`, size `sz`, with probability `p` to cancel an element.

    Each entry is drawn from Bernoulli(1-p) and then divided by (1-p), so
    entries are either 0 or 1/(1-p).
    """
    return x.new(*sz).bernoulli_(1-p).div_(1-p)

class RNNDropout(nn.Module):
    "Dropout with probability p whose mask is shared along dimension 1 of the input."

    def __init__(self, p:float=0.5):
        super().__init__()
        self.p = p

    def forward(self, x:Tensor)->Tensor:
        # Only drop while training with a non-zero probability; otherwise the
        # input object is returned untouched.
        if self.training and self.p != 0.:
            # Size-1 middle dimension: the same mask is broadcast across dim 1.
            mask_shape = (x.size(0), 1, x.size(2))
            return x * dropout_mask(x.data, mask_shape, self.p)
        return x

class WeightDropout(nn.Module):
    "A module that wraps another layer in which some weights will be replaced by 0 during training."

    def __init__(self, module:nn.Module, weight_p:float, layer_names=['weight_hh_l0']):
        """Wrap `module` and prepare dropout on the weights named in `layer_names`.

        `weight_p` is the dropout probability applied to those weights.
        """
        super(WeightDropout, self).__init__()
        self.module,self.weight_p,self.layer_names = module,weight_p,layer_names
        for layer in self.layer_names:
            #Makes a copy of the weights of the selected layers.
            w = getattr(self.module, layer)
            # Registered in list of parameters
            self.register_parameter(str(layer)+'_raw', nn.Parameter(w.data))
            # With training=False, F.dropout leaves the values unchanged; the
            # wrapped module's entry is replaced by that (non-dropped) tensor.
            self.module._parameters[layer] = F.dropout(w, p=self.weight_p, training=False)

    def _setweights(self):
        "Apply dropout to the raw weights."
        for layer in self.layer_names:
            raw_w = getattr(self, str(layer) +'_raw')
            # Dropout is only active when this wrapper is in training mode.
            self.module._parameters[layer] = F.dropout(raw_w, p=self.weight_p, training=self.training)

    def forward(self, *args):
        "Refresh the dropped weights, then run the wrapped module on `args`."
        self._setweights()
        with warnings.catch_warnings():
            #To avoid the warning that comes because the weights aren't flattened.
            warnings.simplefilter("ignore")
            return self.module.forward(*args)

    def reset(self):
        "Restore the wrapped weights from the raw parameters without dropout and reset the wrapped module if it supports it."
        for layer in self.layer_names:
            raw_w = getattr(self, str(layer) + '_raw')
            self.module._parameters[layer] = F.dropout(raw_w, p=self.weight_p, training=False)
        if hasattr(self.module, 'reset'): self.module.reset()

In [None]:
# defining our model architecture 
class CustomTransformerModel(nn.Module):
    """Pretrained transformer followed by a 2-layer bidirectional LSTM head.

    The first output of `transformer_model` goes through two weight-dropped
    bidirectional LSTMs (768 -> 1152 -> 768 features), a dropout layer and a
    768 -> 2 linear layer; the resulting scores are averaged over dim 1.

    NOTE(review): the hard-coded 768 assumes the wrapped model's first output
    is a (batch, seq_len, 768) tensor of hidden states — confirm before
    swapping in a different backbone.
    NOTE(review): the mean over dim 1 is unmasked, so padded positions are
    included in the average.
    """

    def __init__(self, transformer_model: PreTrainedModel):
        super(CustomTransformerModel,self).__init__()
        self.transformer = transformer_model
        # Each LSTM is bidirectional, so its hidden size is half of the
        # feature width it outputs (1152 for the first layer, 768 for the last).
        self.rnns = [nn.LSTM(768 if l == 0 else 1152,
                            (1152 if l != 2-1 else 768)//2,
                            1, bidirectional = True, batch_first = True) for l in range(2)]
        self.rnns = [WeightDropout(rnn, 0.4) for rnn in self.rnns]
        self.rnns = nn.ModuleList(self.rnns)
        self.hidden_dps = nn.ModuleList([RNNDropout(0.5) for l in range(2)])
        self.out = nn.Linear(768,2)
        self.dropout = nn.Dropout(0.3)
        # Initial batch size comes from the notebook-level `bs`; it is
        # overwritten with the real batch size on every forward pass.
        self.bs = bs

    def forward(self, input_ids, attention_mask=None):
        """Return class scores of shape (batch, 2) for `input_ids`.

        If `attention_mask` is not given, it is derived from `input_ids` by
        masking out positions equal to the notebook-level `pad_idx`.
        """
        self.bs = input_ids.shape[0]
        # Fresh zero hidden states sized for this batch.
        self.reset()
        # Honour a caller-supplied mask; only build one when none is passed.
        if attention_mask is None:
            attention_mask = (input_ids!=pad_idx).type(input_ids.type())

        raw_output = self.transformer(input_ids,
                                      attention_mask = attention_mask)[0]
        last = len(self.rnns) - 1
        for l, (rnn,hid_dp) in enumerate(zip(self.rnns,self.hidden_dps)):
            raw_output,_ = rnn(raw_output, self.hidden[l])
            # Hidden dropout is applied between LSTM layers, not after the last one.
            if l != last: raw_output = hid_dp(raw_output)
        output = self.out(self.dropout(raw_output))
        # mean(1) already yields (batch, 2). No squeeze(): it would drop the
        # batch dimension whenever the batch holds a single item.
        return output.mean(1)

    def _one_hidden(self, l:int)->Tensor:
        "Return one zero hidden state for LSTM layer `l`."
        nh = (1152 if l != 1 else 768) //2
        # new_zeros inherits device and dtype from the model's weights, so the
        # model also runs on CPU (the state used to be forced onto CUDA).
        return self.out.weight.new_zeros(2, self.bs, nh)

    def reset(self):
        "Reset the hidden states."
        for r in self.rnns:
            if hasattr(r, 'reset'): r.reset()
        self.hidden = [(self._one_hidden(l), self._one_hidden(l)) for l in range(2)]

In [None]:
# from transformers import RobertaTokenizer, RobertaModel
# Load the checkpoint chosen in the parameters cell so the model always matches
# the tokenizer, which is loaded from the same `pretrained_model_name`.
transformer_model = model_class.from_pretrained(pretrained_model_name)
# transformer_model = RobertaModel.from_pretrained("roberta-base")

In [None]:
# Wrap the pretrained transformer with the LSTM head; the bare last expression
# displays the module structure as the cell output.
custom_transformer_model = CustomTransformerModel(transformer_model = transformer_model)
custom_transformer_model

In [None]:
from fastai.callbacks import *
from transformers import AdamW
from functools import partial

# transformers' AdamW with its `correct_bias` option turned off.
CustomAdamW = partial(AdamW, correct_bias=False)

# fastai Learner combining the databunch, the custom model and the optimizer;
# accuracy and error rate are tracked as metrics.
learner = Learner(databunch, 
                  custom_transformer_model, 
                  opt_func = CustomAdamW, 
                  metrics=[accuracy, error_rate])

# Show graph of learner stats and metrics after each epoch.
learner.callbacks.append(ShowGraph(learner))

# Put the learner in FP16 precision mode. --> Does not seem to work, so it is left disabled.
# if use_fp16: learner = learner.to_fp16()

In [None]:
# For DistilBERT
# list_layers = [learner.model.transformer.embeddings,
#                learner.model.transformer.transformer.layer[0],
#                learner.model.transformer.transformer.layer[1],
#                learner.model.transformer.transformer.layer[2],
#                learner.model.transformer.transformer.layer[3],
#                learner.model.transformer.transformer.layer[4],
#                learner.model.transformer.transformer.layer[5]]

# For xlnet-base-cased
# list_layers = [learner.model.transformer.word_embedding,
#               learner.model.transformer.layer[0],
#               learner.model.transformer.layer[1],
#               learner.model.transformer.layer[2],
#               learner.model.transformer.layer[3],
#               learner.model.transformer.layer[4],
#               learner.model.transformer.layer[5],
#               learner.model.transformer.layer[6],
#               learner.model.transformer.layer[7],
#               learner.model.transformer.layer[8],
#               learner.model.transformer.layer[9],
#               learner.model.transformer.layer[10],
#               learner.model.transformer.layer[11]]

# For roberta-large (24 encoder layers; roberta-base has only 12)
# list_layers = [learner.model.transformer.embeddings,
#               learner.model.transformer.encoder.layer[0],
#               learner.model.transformer.encoder.layer[1],
#               learner.model.transformer.encoder.layer[2],
#               learner.model.transformer.encoder.layer[3],
#               learner.model.transformer.encoder.layer[4],
#               learner.model.transformer.encoder.layer[5],
#               learner.model.transformer.encoder.layer[6],
#               learner.model.transformer.encoder.layer[7],
#               learner.model.transformer.encoder.layer[8],
#               learner.model.transformer.encoder.layer[9],
#               learner.model.transformer.encoder.layer[10],
#               learner.model.transformer.encoder.layer[11],
#               learner.model.transformer.encoder.layer[12],
#               learner.model.transformer.encoder.layer[13],
#               learner.model.transformer.encoder.layer[14],
#               learner.model.transformer.encoder.layer[15],
#               learner.model.transformer.encoder.layer[16],
#               learner.model.transformer.encoder.layer[17],
#               learner.model.transformer.encoder.layer[18],
#               learner.model.transformer.encoder.layer[19],
#               learner.model.transformer.encoder.layer[20],
#               learner.model.transformer.encoder.layer[21],
#               learner.model.transformer.encoder.layer[22],
#               learner.model.transformer.encoder.layer[23],
#               learner.model.transformer.pooler,
#               learner.model.rnns]


# Modules handed to learner.split(): the transformer's embeddings, encoder and
# pooler, followed by the LSTM stack of the custom head.
list_layers = [
    learner.model.transformer.embeddings,
    learner.model.transformer.encoder,
    learner.model.transformer.pooler,
    learner.model.rnns,
    
]

In [None]:
# Split the model into layer groups at the modules in `list_layers`, then
# report how many groups resulted and what they contain.
learner.split(list_layers)
num_groups = len(learner.layer_groups)
print('Learner split in',num_groups,'groups')
print(learner.layer_groups)


In [None]:
# Checkpoint the untrained weights under the name 'untrain', re-seed the RNGs,
# and load the checkpoint back.
learner.save('untrain')
seed_all(seed)
learner.load('untrain');

In [None]:
# Reset the LSTM hidden states, then freeze every layer group except the last one.
learner.model.reset()
learner.freeze_to(-1)

In [None]:
# Run the learning-rate finder and plot the recorded curve with a suggested
# rate, leaving out the last 10 recorded points (skip_end=10).
learner.lr_find()
learner.recorder.plot(skip_end=10,suggestion=True)

In [None]:
# Stage 1: one one-cycle epoch with only the last layer group unfrozen.
learner.fit_one_cycle(1,max_lr=3e-4,moms=(0.8,0.7))

In [None]:
# Unfreeze the last two layer groups and re-run the learning-rate finder.
learner.freeze_to(-2)
learner.lr_find()
learner.recorder.plot(suggestion=True)

In [None]:
# Stage 2: one epoch with a learning-rate range from lr * 0.95**num_groups
# up to lr, passed as a slice.
lr = 1e-4
learner.fit_one_cycle(1, max_lr=slice(lr*0.95**num_groups, lr), moms=(0.8, 0.9))

In [None]:
# Stage 3: unfreeze the last three layer groups and train one more epoch with
# the same learning-rate range as the previous stage.
learner.freeze_to(-3)
learner.fit_one_cycle(1, max_lr=slice(lr*0.95**num_groups, lr), moms=(0.8, 0.9))

In [None]:
# Make every layer group trainable for the final stage.
learner.unfreeze()
# learner.lr_find()
# learner.recorder.plot(suggestion=True)

In [None]:
# Lower base learning rate for training the fully unfrozen model.
lr = 3e-5
# learner.fit_one_cycle(2, max_lr=slice(lr*0.95**num_groups, lr), moms=(0.8, 0.9))

In [None]:
# Final stage: five one-cycle epochs over all layer groups with the lowered
# learning-rate range.
learner.fit_one_cycle(5, max_lr=slice(lr*0.95**num_groups, lr), moms=(0.85, 0.95))