In [1]:
# Enable IPython's autoreload extension in mode 2, so imported modules are
# reloaded before each cell execution.
%load_ext autoreload
%autoreload 2

In [2]:
from fastai.vision.all import *
import sklearn.metrics as skm
from tqdm.notebook import tqdm
import sklearn.feature_extraction.text
from transformers import (BertTokenizer, BertModel,
                          DistilBertTokenizer, DistilBertModel)

In [3]:
# NOTE(review): leftover debugging hook — starts a debugpy listener on port 5678
# (the cell output shows it bound to 127.0.0.1).
# Presumably re-running this cell in the same kernel fails because a listener
# already exists — TODO confirm, and drop this cell before sharing the notebook.
import debugpy
debugpy.listen(5678)

('127.0.0.1', 5678)

In [4]:
from shopee_utils import *

In [5]:
# Prefer the Kaggle input layout; when that directory is absent, fall back to
# the local data directory and the local model checkpoint.
PATH = Path('../input/shopee-product-matching')
if PATH.is_dir():
    model_file = '../input/resnet-model/resnet34.pth'
else:
    PATH = Path('/home/slex/data/shopee')
    model_file = 'models/resnet34.pth'

In [6]:
# Read the training table and mark rows whose `split` column equals 0 as the
# validation rows (the `is_valid` column is what `ColSplitter` reads by default).
train_df = pd.read_csv(PATH / 'train_split.csv')
train_df['is_valid'] = train_df['split'].eq(0)

In [7]:
# Pretrained checkpoint name; passed to `from_pretrained` for both the
# tokenizer (TitleTransform) and the encoder (new_model).
model_name='cahya/bert-base-indonesian-522M'
#model_name='cahya/distilbert-base-indonesian'

In [8]:
class ArcFaceClassifier(nn.Module):
    """Cosine-similarity classification head meant to be paired with `arcface_loss`.

    The incoming embedding goes through BatchNorm and Dropout, is L2-normalised,
    and is multiplied with the L2-normalised class weight columns, so every
    output entry is a cosine in [-1, 1] rather than an unbounded logit.

    Args:
        in_features: Width of the incoming embedding vectors.
        output_classes: Number of classes (columns of the weight matrix `W`).
    """
    def __init__(self, in_features, output_classes):
        super().__init__()
        # The head operates directly on the incoming embedding, so its width is
        # `in_features`. (It used to be hard-coded to 768, which silently
        # ignored the argument and broke for any other embedding size.)
        emb_dim = in_features
        # Kept as a Sequential with BatchNorm at index 0 and Dropout at index 1
        # so that existing checkpoints keep the same state-dict keys.
        self.initial_layers = nn.Sequential(
            nn.BatchNorm1d(emb_dim),
            nn.Dropout(.25))
        # Allocate uninitialised storage, then fill it with Kaiming-uniform values.
        self.W = nn.Parameter(torch.empty(emb_dim, output_classes))
        nn.init.kaiming_uniform_(self.W)

    def forward(self, x):
        """Return cosine similarities between each row of `x` and each class column."""
        x = self.initial_layers(x)
        x_norm = F.normalize(x)              # unit-length rows (default dim=1)
        W_norm = F.normalize(self.W, dim=0)  # unit-length class columns
        return x_norm @ W_norm
    
    
def arcface_loss(cosine, targ, m=.5, s=30):
    """Additive angular margin (ArcFace) loss computed on cosine-similarity outputs.

    Args:
        cosine: Tensor of cosines with one column per class.
        targ: Integer class indices, one per row of `cosine`.
        m: Margin (in radians) added to the angle of the target class.
        s: Scale applied to the margin-adjusted cosines before cross-entropy.

    Returns:
        The cross-entropy loss of the scaled, margin-adjusted cosines.
    """
    # Stay strictly inside (-1, 1): the derivative of arccos blows up at the ends.
    cosine = cosine.clip(-1+1e-7, 1-1e-7)
    arcosine = cosine.arccos()
    # The class count is taken from the input itself instead of the
    # notebook-global `dls`, so the loss has no hidden-state dependency.
    arcosine += F.one_hot(targ, num_classes=cosine.shape[-1]) * m
    cosine2 = arcosine.cos()
    cosine2 *= s
    return F.cross_entropy(cosine2, targ)

In [9]:
class MyModel(nn.Module):
    """BERT encoder followed by an `ArcFaceClassifier` head.

    Args:
        bert_model: Encoder module whose output exposes `last_hidden_state`.
        output_classes: Number of classes for the head. When omitted it falls
            back to the notebook-global `dls.c`, which was previously the only
            (implicit) source of this value.
    """
    def __init__(self, bert_model, output_classes=None):
        super().__init__()
        self.bert_model = bert_model
        if output_classes is None:
            # Backward-compatible default: read the class count from the global DataLoaders.
            output_classes = dls.c
        self.classifier = ArcFaceClassifier(768, output_classes)

    def forward(self, x):
        """Unpack the input tuple into the encoder and classify the position-0 hidden state."""
        output = self.bert_model(*x)
        # Hidden state at sequence position 0 serves as the embedding.
        embeddings = output.last_hidden_state[:,0,:]
        return self.classifier(embeddings)

In [10]:
class EmbsModel(nn.Module):
    """Wrapper that outputs the encoder's embedding instead of class scores."""
    def __init__(self, bert_model, classifier):
        super().__init__()
        self.bert_model = bert_model
        # Registered as a submodule, but currently not applied in `forward`.
        self.classifier_layers = classifier.initial_layers

    def forward(self, x):
        """Unpack the input tuple into the encoder; return the hidden state at position 0."""
        hidden = self.bert_model(*x).last_hidden_state
        return hidden[:, 0, :]

In [11]:
class TitleTransform(Transform):
    """Turn a row's `title` into a tuple of fixed-length tokenizer tensors."""
    def __init__(self):
        super().__init__()
        self.tokenizer = BertTokenizer.from_pretrained(model_name)

    def encodes(self, row):
        # Every title is padded/truncated to exactly 50 tokens.
        tokens = self.tokenizer(row.title, padding = 'max_length', max_length=50, truncation=True, return_tensors='pt')
        # `return_tensors='pt'` adds a leading batch axis; squeeze it away.
        # Tuple order is what `bert_model(*x)` receives positionally in the model classes.
        return (tokens['input_ids'].squeeze(),
                tokens['attention_mask'].squeeze(),
                tokens['token_type_ids'].squeeze())

In [19]:
tfm = TitleTransform()

# Inputs are tokenised titles; targets are `label_group` categories.
# `ColSplitter()` uses its default column for the train/validation split.
data_block = DataBlock(
    blocks=(
        TransformBlock(type_tfms=tfm),
        CategoryBlock(vocab=train_df['label_group'].to_list()),
    ),
    get_y=ColReader('label_group'),
    splitter=ColSplitter(),
)
dls = data_block.dataloaders(train_df, bs=256, num_workers=16)


In [13]:
def do_chunk(embs, step=10000):
    """Yield consecutive slices of `embs` along its first axis.

    Args:
        embs: Any sliceable sequence that supports `len()` (tensor, array, list).
        step: Maximum number of rows per chunk. Defaults to 10000, the value
            that used to be hard-coded.

    Yields:
        `embs[i:i + step]` for i = 0, step, 2*step, ...; the final chunk may be
        shorter. Nothing is yielded for empty input.

    Raises:
        ValueError: If `step` is not positive (raised when iteration starts).
    """
    if step <= 0:
        raise ValueError(f"step must be positive, got {step}")
    total = len(embs)
    for chunk_start in range(0, total, step):
        # Slicing clamps at the end of the sequence, so no explicit min() is needed.
        yield embs[chunk_start:chunk_start + step]

In [14]:
def new_model():
    """Build a fresh `MyModel` around the pretrained encoder named by `model_name`."""
    return MyModel(BertModel.from_pretrained(model_name))

In [15]:
def split_3way(model):
    """Three parameter groups: embeddings | encoder + pooler | classifier head."""
    bert = model.bert_model
    embedding_group = params(bert.embeddings)
    body_group = params(bert.encoder) + params(bert.pooler)
    head_group = params(model.classifier)
    return L(embedding_group, body_group, head_group)
def split_2way(model):
    """Two parameter groups: the whole BERT model | classifier head."""
    groups = [params(model.bert_model), params(model.classifier)]
    return L(*groups)

In [16]:
def count_params(parameters):
    """Total element count (sum of `numel()`) over an iterable of parameters."""
    return sum(p.numel() for p in parameters)

# count_params(learn.model.parameters())

# splitted = split_2way(learn.model)
# parts =[count_params(m) /1e6 for m in splitted]
# print(parts, sum(parts))

In [36]:
# Learner with a freshly built model, the 2-group splitter (BERT | classifier
# head) and the ArcFace loss.
learn = Learner(dls,new_model(),  splitter=split_2way, loss_func=arcface_loss)

In [None]:
# 2 frozen epochs followed by 15 unfrozen epochs at base learning rate 1e-2.
# NOTE(review): the recorded log below stops at epoch 6 of the second phase and
# valid_loss rises while train_loss falls — presumably the run was interrupted,
# and the validation label groups may not occur in training; confirm.
learn.fine_tune(15, 1e-2, freeze_epochs=2)

epoch,train_loss,valid_loss,time
0,21.140066,26.34971,00:36
1,16.573906,28.773136,00:35


epoch,train_loss,valid_loss,time
0,12.491858,28.489353,00:50
1,11.108472,29.147408,00:49
2,10.006044,29.501461,00:49
3,9.03965,29.936142,00:48
4,8.08031,30.267073,00:48
5,7.075113,30.504175,00:47
6,6.181442,30.712891,00:48


In [30]:
# NOTE(review): this cell's execution count (In [30]) precedes the save cell
# below (In [87]); on a fresh top-to-bottom run it requires the 'bert814val'
# checkpoint to exist already and replaces the weights that were just trained.
learn.load('bert814val')

<fastai.learner.Learner at 0x7f8a4127af70>

In [87]:
# Save the Learner's model under the name 'bert814val' (resulting path is shown
# in the cell output).
learn.save('bert814val')

Path('models/bert814val.pth')

In [43]:
# Handle on the encoder alone, without the classifier head.
bert_model = learn.model.bert_model

In [45]:
# Write only the encoder's state dict to 'models/bert_model.pth'.
torch.save(bert_model.state_dict(), 'models/bert_model.pth')

## Validation set

In [None]:
# Embedding extractor built from the Learner's encoder and head; moved to the
# GPU and switched to eval mode.
embs_model = EmbsModel(learn.model.bert_model, learn.model.classifier).cuda().eval()

In [None]:
# Embeddings for the validation DataLoader; the second returned value is discarded.
# (`embs_from_model` is presumably provided by `shopee_utils` — confirm.)
valid_embs, _ = embs_from_model(embs_model, dls.valid)

In [None]:
# The validation embeddings are passed both whole and as a chunk generator;
# presumably this is a chunked nearest-neighbour search of the set against
# itself — confirm in `get_nearest`.
dists, inds = get_nearest(valid_embs, do_chunk(valid_embs))

In [None]:
# Validation rows only, copied and re-indexed from 0; `reset_index()` without
# `drop=True` keeps the old index as a column.
valid_df = train_df.loc[train_df['is_valid'].eq(True)].copy().reset_index()
valid_df = add_target_groups(valid_df)

In [None]:
# Keep only the first 10 * len(valid_df) entries returned by `sorted_pairs`.
pairs = sorted_pairs(dists, inds)[:len(valid_df)*10]

In [None]:
# Return value is discarded via `_`; presumably the helper reports its score
# itself — confirm in `build_from_pairs`.
_=build_from_pairs(pairs, valid_df.target.to_list())