Commit 86e54c4

Fix bugs:
1. Add .h5 file extension for pretrained embeddings
2. Check that pretrained keys and vocab keys have the same dtype
3. Fix a performance issue from repeatedly calling vocab_size()
xpai committed Jan 16, 2024
1 parent 1f0e793 commit 86e54c4
Showing 4 changed files with 12 additions and 6 deletions.
1 change: 1 addition & 0 deletions CHANGELOG.md
@@ -8,6 +8,7 @@
+ [Edit] Rename FINAL model to FinalNet
+ [Edit] Update RecZoo URLs
+ [Fix] Fix bug #75
+ [Fix] Fix h5 file extension issue

**FuxiCTR v2.1.2, 2023-11-01**
+ [Edit] Update H5DataBlockLoader to support dataloader with multiprocessing
3 changes: 2 additions & 1 deletion fuxictr/preprocess/feature_processor.py
@@ -112,6 +112,7 @@ def fit(self, train_ddf, min_categr_count=1, num_buckets=10, **kwargs):
elif col["type"] == "numeric":
self.fit_numeric_col(col, train_ddf[name].values)
elif col["type"] == "categorical":

self.fit_categorical_col(col, train_ddf[name].values,
min_categr_count=min_categr_count,
num_buckets=num_buckets)
@@ -130,7 +131,7 @@ def fit(self, train_ddf, min_categr_count=1, num_buckets=10, **kwargs):
if "pretrain_dim" in col:
self.feature_map.features[name]["pretrain_dim"] = col["pretrain_dim"]
shutil.copy(col["pretrained_emb"],
os.path.join(self.data_dir, "pretrained_{}".format(name)))
os.path.join(self.data_dir, "pretrained_{}.h5".format(name)))
self.feature_map.features[name]["pretrained_emb"] = "pretrained_{}.h5".format(name)
self.feature_map.features[name]["freeze_emb"] = col.get("freeze_emb", True)
self.feature_map.features[name]["pretrain_usage"] = col.get("pretrain_usage", "init")
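
For clarity, the mismatch this hunk fixes: the feature map records the pretrained file as `pretrained_{name}.h5`, but the copy destination previously had no `.h5` extension, so the recorded name pointed at a file that did not exist on disk. Below is a minimal sketch of the corrected behaviour; `copy_pretrained_emb` is an illustrative stand-in for the relevant part of `FeatureProcessor.fit`, not FuxiCTR API.

```python
import os
import shutil

def copy_pretrained_emb(src_path, data_dir, name):
    """Sketch of the fix: the copy target now carries the same .h5 extension
    that is recorded in feature_map.features[name]["pretrained_emb"]."""
    dst = os.path.join(data_dir, "pretrained_{}.h5".format(name))  # previously "pretrained_{}" with no extension
    shutil.copy(src_path, dst)
    return "pretrained_{}.h5".format(name)  # name stored in the feature map, now matching the file on disk
```
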
8 changes: 5 additions & 3 deletions fuxictr/preprocess/tokenizer.py
@@ -90,7 +90,7 @@ def merge_vocab(self, shared_tokenizer):
return shared_tokenizer

def vocab_size(self):
return max(self.vocab.values()) + 1
return max(self.vocab.values()) + 1 # In case that keys start from 1

def update_vocab(self, word_list):
new_words = 0
@@ -131,11 +131,13 @@ def load_pretrained_vocab(self, feature_dtype, pretrain_path, expand_vocab=True)
keys = hf["key"][:]
keys = keys.astype(feature_dtype) # in case mismatch of dtype between int and str
# Update vocab with pretrained keys in case new tokens appear in validation or test set
# Do not update OOV index here since it is used in PretrainedEmbedding
# Do NOT update OOV index here since it is used in PretrainedEmbedding
if expand_vocab:
vocab_size = self.vocab_size()
for word in keys:
if word not in self.vocab:
self.vocab[word] = self.vocab_size()
self.vocab[word] = vocab_size
vocab_size += 1


def count_tokens(texts, splitter):
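
This hunk addresses the third item in the commit message: `vocab_size()` takes a max over every index in the vocab, so calling it once per new pretrained key makes vocab expansion quadratic in the vocab size. Computing the size once and incrementing a local counter keeps it linear. A minimal standalone sketch of the same pattern follows; `expand_vocab` and the toy vocab are illustrative, not FuxiCTR API.

```python
def expand_vocab(vocab, pretrained_keys):
    """Sketch of the performance fix: scan the vocab once for its current
    size, then bump a local counter for each new key, instead of re-running
    max(vocab.values()) inside the loop."""
    vocab_size = max(vocab.values()) + 1   # what Tokenizer.vocab_size() returns
    for word in pretrained_keys:
        if word not in vocab:
            vocab[word] = vocab_size
            vocab_size += 1
    return vocab

# Illustrative usage: only unseen keys get new, consecutive indices.
vocab = {"<pad>": 0, "a": 1, "b": 2}
expand_vocab(vocab, ["b", "c", "d"])
assert vocab["c"] == 3 and vocab["d"] == 4
```
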
6 changes: 4 additions & 2 deletions fuxictr/pytorch/layers/embeddings/pretrained_embedding.py
@@ -76,7 +76,8 @@ def get_pretrained_embedding(self, pretrain_path):
def load_feature_vocab(self, vocab_path, feature_name):
with io.open(vocab_path, "r", encoding="utf-8") as fd:
vocab = json.load(fd)
return vocab[feature_name]
vocab_type = type(list(vocab.items())[1][0]) # get key dtype
return vocab[feature_name], vocab_type

def load_pretrained_embedding(self, vocab_size, pretrain_dim, pretrain_path, vocab_path,
feature_name, freeze=False, padding_idx=None):
@@ -91,7 +92,8 @@ def load_pretrained_embedding(self, vocab_size, pretrain_dim, pretrain_path, vocab_path,
embedding_matrix[padding_idx, :] = np.zeros(pretrain_dim) # set as zero for PAD
keys, embeddings = self.get_pretrained_embedding(pretrain_path)
assert embeddings.shape[-1] == pretrain_dim, f"pretrain_dim={pretrain_dim} not correct."
vocab = self.load_feature_vocab(vocab_path, feature_name)
vocab, vocab_type = self.load_feature_vocab(vocab_path, feature_name)
keys = keys.astype(vocab_type) # ensure the same dtype between pretrained keys and vocab keys
for idx, word in enumerate(keys):
if word in vocab:
embedding_matrix[vocab[word]] = embeddings[idx]
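
This hunk implements the second item in the commit message: keys read from the pretrained .h5 file may come back as a different type (for example integers) than the vocab keys loaded from the JSON file (strings), in which case `word in vocab` never matches and the pretrained rows are silently skipped. Casting the keys to the vocab key type makes the lookups hit. A minimal sketch of the idea is below; `align_key_dtype` is an illustrative helper (it infers the key type from the first vocab entry), not FuxiCTR API.

```python
import numpy as np

def align_key_dtype(keys, vocab):
    """Sketch of the dtype fix: cast pretrained keys to the type of the
    vocab keys so dictionary lookups can match."""
    vocab_type = type(next(iter(vocab)))   # type of the vocab keys (here: str)
    return keys.astype(vocab_type)

# Illustrative usage: without the cast, integer keys never hit the string vocab.
vocab = {"101": 1, "102": 2}               # vocab loaded from JSON, keys are str
keys = np.array([101, 102])                # pretrained keys loaded from .h5 as ints
aligned = align_key_dtype(keys, vocab)
assert all(word in vocab for word in aligned)
```
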
