In [17]:
import numpy as np
import pandas as pd
import pickle
import torch
import torch.nn as nn
from torch.nn.utils.rnn import pad_sequence
from transformers import (
  BertTokenizerFast,
  AutoModel,
)
from customized.preprocess import BertTokenDataset

In [2]:
prod = pd.read_pickle('data/prod.pkl')
item_names = list(set(prod.name))
item_max_len = max([len(i) for i in item_names])

### pre-trained BERT

In [3]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

tokenizer = BertTokenizerFast.from_pretrained('bert-base-chinese')
bert_model = AutoModel.from_pretrained('ckiplab/bert-base-chinese').to(device)

tokenizer_result = tokenizer(item_names, max_length=item_max_len, padding=True, return_attention_mask=True, return_tensors='pt')
input_ids = tokenizer_result.input_ids
attention_mask = tokenizer_result.attention_mask
print(input_ids.shape)

token_dataset = BertTokenDataset(input_ids)
token_dataloader = torch.utils.data.DataLoader(token_dataset, shuffle=False, batch_size=32)

item_embeddings = []

for i, x in enumerate(token_dataloader):
    x = x.to(device)
    embedding = bert_model(x).pooler_output
    embedding = embedding.tolist()
    item_embeddings.extend(embedding)
    if (i+1) % 500 == 0:
        print("Step [{}/{}]" .format(i+1, len(token_dataloader)))

item_embeddings = np.array(item_embeddings)
print(item_embeddings.shape)

Some weights of the model checkpoint at ckiplab/bert-base-chinese were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertModel were not initialized from the model checkpoint at ckiplab/bert-base-chinese and are newly initialized: ['bert.pooler.dense.weight', 'bert.pooler.dens

torch.Size([247092, 94])
Step [500/7722]
Step [1000/7722]
Step [1500/7722]
Step [2000/7722]
Step [2500/7722]
Step [3000/7722]
Step [3500/7722]
Step [4000/7722]
Step [4500/7722]
Step [5000/7722]
Step [5500/7722]
Step [6000/7722]
Step [6500/7722]
Step [7000/7722]
Step [7500/7722]
(247092, 768)


In [4]:
item2vec = {name:vec for name, vec in zip(item_names, item_embeddings)}

In [15]:
list(item2vec.keys())[:5]

['3S-3173 B5',
 '【加購0603】GAP細大logo短T(9900黑/9901灰))',
 '公主 0326 A14-BC冰冰防蚊乳 (50ml)*1',
 '大蘋 0416 A02-(2件組)帝喜牌 手打鐵鍋 (32cm)*1+dstt 微壓悶燒鍋 (3.5L)*1',
 '第 010 標 (2/9)韓國 SUNTIQUE 遮瑕BB防曬棒']

In [6]:
item2vec['【商城】澳洲Eureka天然10%水溶性尤加利精油500ml'].shape

(768,)

In [18]:
with open('data/item2vec.pkl', 'wb') as f:
    pickle.dump(item2vec, f)

In [34]:
test['【商城】澳洲Eureka天然10%水溶性尤加利精油500ml'].shape

(768,)

In [39]:
with open('data/item2vec.pkl', 'rb') as f:
    test = pickle.load(f)
from deepdiff import DeepDiff
print(DeepDiff(test, item2vec))

FileNotFoundError: [Errno 2] No such file or directory: 'data/item2vec.pkl'