Commit 86e54c4

Fix bugs:
1. Add .h5 file extension for pretrained embeddings
2. Check that pretrained keys and vocab keys have the same dtype
3. Fix a performance issue from repeatedly calling vocab_size()
xpai committed Jan 16, 2024
1 parent 1f0e793 commit 86e54c4
Showing 4 changed files with 12 additions and 6 deletions.
1 change: 1 addition & 0 deletions CHANGELOG.md
@@ -8,6 +8,7 @@
+ [Edit] Rename FINAL model to FinalNet
+ [Edit] Update RecZoo URLs
+ [Fix] Fix bug #75
+ [Fix] Fix h5 file extension issue

**FuxiCTR v2.1.2, 2023-11-01**
+ [Edit] Update H5DataBlockLoader to support dataloader with multiprocessing
3 changes: 2 additions & 1 deletion fuxictr/preprocess/feature_processor.py
@@ -112,6 +112,7 @@ def fit(self, train_ddf, min_categr_count=1, num_buckets=10, **kwargs):
elif col["type"] == "numeric":
self.fit_numeric_col(col, train_ddf[name].values)
elif col["type"] == "categorical":

self.fit_categorical_col(col, train_ddf[name].values,
min_categr_count=min_categr_count,
num_buckets=num_buckets)
@@ -130,7 +131,7 @@ def fit(self, train_ddf, min_categr_count=1, num_buckets=10, **kwargs):
if "pretrain_dim" in col:
self.feature_map.features[name]["pretrain_dim"] = col["pretrain_dim"]
shutil.copy(col["pretrained_emb"],
os.path.join(self.data_dir, "pretrained_{}".format(name)))
os.path.join(self.data_dir, "pretrained_{}.h5".format(name)))
self.feature_map.features[name]["pretrained_emb"] = "pretrained_{}.h5".format(name)
self.feature_map.features[name]["freeze_emb"] = col.get("freeze_emb", True)
self.feature_map.features[name]["pretrain_usage"] = col.get("pretrain_usage", "init")
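
For clarity, the mismatch this hunk fixes: the feature map records the pretrained file as `pretrained_{name}.h5`, but the copy destination previously had no `.h5` extension, so the recorded name pointed at a file that did not exist on disk. Below is a minimal sketch of the corrected behaviour; `copy_pretrained_emb` is an illustrative stand-in for the relevant part of `FeatureProcessor.fit`, not FuxiCTR API.

```python
import os
import shutil

def copy_pretrained_emb(src_path, data_dir, name):
    """Sketch of the fix: the copy target now carries the same .h5 extension
    that is recorded in feature_map.features[name]["pretrained_emb"]."""
    dst = os.path.join(data_dir, "pretrained_{}.h5".format(name))  # previously "pretrained_{}" with no extension
    shutil.copy(src_path, dst)
    return "pretrained_{}.h5".format(name)  # name stored in the feature map, now matching the file on disk
```
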
8 changes: 5 additions & 3 deletions fuxictr/preprocess/tokenizer.py
@@ -90,7 +90,7 @@ def merge_vocab(self, shared_tokenizer):
return shared_tokenizer

def vocab_size(self):
return max(self.vocab.values()) + 1
return max(self.vocab.values()) + 1 # In case that keys start from 1

def update_vocab(self, word_list):
new_words = 0
@@ -131,11 +131,13 @@ def load_pretrained_vocab(self, feature_dtype, pretrain_path, expand_vocab=True)
keys = hf["key"][:]
keys = keys.astype(feature_dtype) # in case mismatch of dtype between int and str
# Update vocab with pretrained keys in case new tokens appear in validation or test set
# Do not update OOV index here since it is used in PretrainedEmbedding
# Do NOT update OOV index here since it is used in PretrainedEmbedding
if expand_vocab:
vocab_size = self.vocab_size()
for word in keys:
if word not in self.vocab:
self.vocab[word] = self.vocab_size()
self.vocab[word] = vocab_size
vocab_size += 1


def count_tokens(texts, splitter):
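
This hunk addresses the third item in the commit message: `vocab_size()` takes a max over every index in the vocab, so calling it once per new pretrained key makes vocab expansion quadratic in the vocab size. Computing the size once and incrementing a local counter keeps it linear. A minimal standalone sketch of the same pattern follows; `expand_vocab` and the toy vocab are illustrative, not FuxiCTR API.

```python
def expand_vocab(vocab, pretrained_keys):
    """Sketch of the performance fix: scan the vocab once for its current
    size, then bump a local counter for each new key, instead of re-running
    max(vocab.values()) inside the loop."""
    vocab_size = max(vocab.values()) + 1   # what Tokenizer.vocab_size() returns
    for word in pretrained_keys:
        if word not in vocab:
            vocab[word] = vocab_size
            vocab_size += 1
    return vocab

# Illustrative usage: only unseen keys get new, consecutive indices.
vocab = {"<pad>": 0, "a": 1, "b": 2}
expand_vocab(vocab, ["b", "c", "d"])
assert vocab["c"] == 3 and vocab["d"] == 4
```
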
6 changes: 4 additions & 2 deletions fuxictr/pytorch/layers/embeddings/pretrained_embedding.py
@@ -76,7 +76,8 @@ def get_pretrained_embedding(self, pretrain_path):
def load_feature_vocab(self, vocab_path, feature_name):
with io.open(vocab_path, "r", encoding="utf-8") as fd:
vocab = json.load(fd)
return vocab[feature_name]
vocab_type = type(list(vocab.items())[1][0]) # get key dtype
return vocab[feature_name], vocab_type

def load_pretrained_embedding(self, vocab_size, pretrain_dim, pretrain_path, vocab_path,
feature_name, freeze=False, padding_idx=None):
@@ -91,7 +92,8 @@ def load_pretrained_embedding(self, vocab_size, pretrain_dim, pretrain_path, vocab_path,
embedding_matrix[padding_idx, :] = np.zeros(pretrain_dim) # set as zero for PAD
keys, embeddings = self.get_pretrained_embedding(pretrain_path)
assert embeddings.shape[-1] == pretrain_dim, f"pretrain_dim={pretrain_dim} not correct."
vocab = self.load_feature_vocab(vocab_path, feature_name)
vocab, vocab_type = self.load_feature_vocab(vocab_path, feature_name)
keys = keys.astype(vocab_type) # ensure the same dtype between pretrained keys and vocab keys
for idx, word in enumerate(keys):
if word in vocab:
embedding_matrix[vocab[word]] = embeddings[idx]
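
This hunk implements the second item in the commit message: keys read from the pretrained .h5 file may come back as a different type (for example integers) than the vocab keys loaded from the JSON file (strings), in which case `word in vocab` never matches and the pretrained rows are silently skipped. Casting the keys to the vocab key type makes the lookups hit. A minimal sketch of the idea is below; `align_key_dtype` is an illustrative helper (it infers the key type from the first vocab entry), not FuxiCTR API.

```python
import numpy as np

def align_key_dtype(keys, vocab):
    """Sketch of the dtype fix: cast pretrained keys to the type of the
    vocab keys so dictionary lookups can match."""
    vocab_type = type(next(iter(vocab)))   # type of the vocab keys (here: str)
    return keys.astype(vocab_type)

# Illustrative usage: without the cast, integer keys never hit the string vocab.
vocab = {"101": 1, "102": 2}               # vocab loaded from JSON, keys are str
keys = np.array([101, 102])                # pretrained keys loaded from .h5 as ints
aligned = align_key_dtype(keys, vocab)
assert all(word in vocab for word in aligned)
```
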
