In [1]:
import numpy as np
import pandas as pd
import pickle
import torch
import torch.nn as nn
from torch.nn.utils.rnn import pad_sequence
from transformers import (
  BertTokenizerFast,
  AutoModel,
)
from customized.preprocess import BertTokenDataset

In [2]:
# Load the product table and collect the distinct product names to embed.
prod = pd.read_pickle('data/prod.pkl')

# `unique()` keeps first-appearance order, so the list (and therefore the row
# order of the embedding matrix built from it) is identical on every run.
# A plain `set` is ordered by randomized string hashes and changes per kernel.
# Missing names are dropped because they cannot be tokenized.
item_names = prod['name'].dropna().unique().tolist()

# Longest name measured in characters (not tokens); 0 when there are no names.
item_max_len = max((len(name) for name in item_names), default=0)

### pre-trained BERT

In [3]:
# Embed every product name with a pre-trained Chinese BERT encoder.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# NOTE(review): the tokenizer and the checkpoint come from different repos;
# the CKIP model card recommends exactly this pairing -- confirm if either changes.
tokenizer = BertTokenizerFast.from_pretrained('bert-base-chinese')
bert_model = AutoModel.from_pretrained('ckiplab/bert-base-chinese').to(device)
bert_model.eval()  # inference only: keep dropout disabled

# Pad to the longest name in the batch and truncate at the model's position
# limit; `max_length` without `truncation=True` does not limit anything.
tokenizer_result = tokenizer(
    item_names,
    padding=True,
    truncation=True,
    max_length=bert_model.config.max_position_embeddings,
    return_attention_mask=True,
    return_tensors='pt',
)
input_ids = tokenizer_result.input_ids
attention_mask = tokenizer_result.attention_mask
print(input_ids.shape)

# Batch ids together with their masks so padding positions can be ignored by
# the encoder; without the mask, padding tokens leak into every embedding.
token_dataset = torch.utils.data.TensorDataset(input_ids, attention_mask)
token_dataloader = torch.utils.data.DataLoader(token_dataset, shuffle=False, batch_size=32)

item_embeddings = []

# No gradients are needed for feature extraction; skipping them avoids
# building an autograd graph for each of the batches.
with torch.no_grad():
    for i, (batch_ids, batch_mask) in enumerate(token_dataloader):
        outputs = bert_model(
            input_ids=batch_ids.to(device),
            attention_mask=batch_mask.to(device),
        )
        # Use the [CLS] hidden state rather than `pooler_output`: the load
        # warning reports the pooler weights as newly (randomly) initialized
        # for this checkpoint, so `pooler_output` is not reproducible.
        embedding = outputs.last_hidden_state[:, 0]
        item_embeddings.append(embedding.cpu().numpy())
        if (i+1) % 500 == 0:
            print("Step [{}/{}]" .format(i+1, len(token_dataloader)))

# One (n_items, hidden_size) float32 matrix, rows aligned with `item_names`.
item_embeddings = np.concatenate(item_embeddings, axis=0)
print(item_embeddings.shape)

Some weights of the model checkpoint at ckiplab/bert-base-chinese were not used when initializing BertModel: ['cls.predictions.transform.dense.bias', 'cls.predictions.decoder.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertModel were not initialized from the model checkpoint at ckiplab/bert-base-chinese and are newly initialized: ['bert.pooler.dense.weight', 'bert.pooler.dens

torch.Size([247092, 94])
Step [500/7722]
Step [1000/7722]
Step [1500/7722]
Step [2000/7722]
Step [2500/7722]
Step [3000/7722]
Step [3500/7722]
Step [4000/7722]
Step [4500/7722]
Step [5000/7722]
Step [5500/7722]
Step [6000/7722]
Step [6500/7722]
Step [7000/7722]
Step [7500/7722]
(247092, 768)


In [23]:
# Distinct (id, name) pairs for every product whose name is in `item_names`,
# saved to disk for later lookups.
name_mask = prod.name.isin(item_names)
id2name = prod.loc[name_mask, ['id', 'name']].drop_duplicates()
id2name.to_pickle('data/id2name.pkl')

In [22]:
item_names_df = pd.DataFrame({'name': item_names})

# Count how many ids each product name is joined to:
# a single product name can be associated with several ids.
(
    item_names_df
    .merge(id2name, on='name', how='left')
    ['name']
    .value_counts()
)

316不鏽鋼陶晶煎烤盤(34公分)*1個                                                                   93
西班牙米其林主廚系列4件組*1組                                                                       79
316不鏽鋼漏勺*1支                                                                            58
(L-胖丁+1)老北雞腿*1支                                                                        42
316七層深型油炸鍋*1個(20公分)                                                                    41
                                                                                       ..
舊三郎 0501 A12-(30片)白松露奇蹟撫紋面膜(銀)*30                                                       1
叫賣 0328 A04-(5罐)台灣一條根 痛快噴 (120ml)*5                                                     1
888 1219 A09-日本味王桑葉有酵錠(90粒/包)(效期2022/3)*1                                               1
公主 0625 G12-刨絲器(顏色隨機)*1                                                                 1
公主 0315 A14-(3件組)雅詩蘭黛 Pro極速緊緻肌密全能精華100ml(效期2023/10)*1+SK-II 亮采化妝水230ml(效期2024/07)*2     1
Name: name

In [24]:
# Map each product name to its embedding row (pairs are matched by position).
item2vec = dict(zip(item_names, item_embeddings))

In [25]:
# Peek at the first five product names stored in the mapping.
list(item2vec)[:5]

['第 121 標(11/27)創意車載汽車儲網-約19X7cm(KMS6888)一個29元',
 '大師兄 0619 A12-卡通涼感凝膠冰墊(款式隨機)*1--刪單不通知',
 '第 045 標(10/27)2024年4月三太子一條根滾珠凝露(可選)-涼&熱-擇1-40G一罐29元',
 '888 0209 G06-(5條)JUL-怡百麗牆面修補膏(250g)+嘴+刮刀*5',
 "公主 1010 A25-(4件)Cab's SHOWERHEAD 黑金一鍵止水萬向蓮蓬頭*1+（-濾心-） 黑金一鍵止水萬向蓮蓬頭*3"]

In [26]:
# Spot-check the vector shape stored for one known product name.
sample_name = '【商城】澳洲Eureka天然10%水溶性尤加利精油500ml'
item2vec[sample_name].shape

(768,)

In [18]:
# Persist the name -> embedding mapping to disk.
with open('data/item2vec.pkl', 'wb') as out_file:
    pickle.dump(item2vec, out_file)

In [34]:
# Reload the saved mapping here so this check runs top-to-bottom on a fresh
# kernel instead of relying on `test` having been defined by a later cell.
# NOTE: pickle.load can execute arbitrary code -- only load files this
# notebook wrote itself.
with open('data/item2vec.pkl', 'rb') as f:
    test = pickle.load(f)

# Spot-check the reloaded vector shape for one known product name.
test['【商城】澳洲Eureka天然10%水溶性尤加利精油500ml'].shape

(768,)

In [None]:
# Round-trip check: reload the pickle and compare it with the in-memory mapping.
# NOTE: pickle.load can execute arbitrary code -- only load files this
# notebook wrote itself.
with open('data/item2vec.pkl', 'rb') as f:
    test = pickle.load(f)

# Compare with numpy instead of a generic deep diff: it needs no import
# beyond the ones at the top of the notebook.
keys_in_one_only = item2vec.keys() ^ test.keys()
differing_vectors = [
    name
    for name in item2vec.keys() & test.keys()
    if not np.array_equal(item2vec[name], test[name])
]

# Both counts are 0 when the saved file matches the in-memory mapping.
print({
    'keys_in_one_only': len(keys_in_one_only),
    'differing_vectors': len(differing_vectors),
})