Commit: update examples
shenweichen committed Jul 18, 2021
1 parent 825bba1 commit 95ad62e
Showing 14 changed files with 103 additions and 91 deletions.
2 changes: 1 addition & 1 deletion README.md
@@ -101,7 +101,7 @@ If you find this code useful in your research, please cite it using the followin
<td>
<a href="https://github.com/zanshuxun"><img width="70" height="70" src="https://github.com/zanshuxun.png?s=40" alt="pic"></a><br>
<a href="https://github.com/zanshuxun">Zan Shuxun</a> ​
<p>Beijing University <br> of Posts and <br> Telecommunications </p>​
<p>Alibaba Group </p>​
</td>
<td>
​ <a href="https://github.com/pandeconscious"><img width="70" height="70" src="https://github.com/pandeconscious.png?s=40" alt="pic"></a><br>
2 changes: 1 addition & 1 deletion deepctr/__init__.py
@@ -1,4 +1,4 @@
from .utils import check_version

__version__ = '0.8.6'
__version__ = '0.8.7'
check_version(__version__)
2 changes: 1 addition & 1 deletion deepctr/layers/sequence.py
@@ -561,7 +561,7 @@ def call(self, inputs, mask=None, training=None, **kwargs):
try:
outputs = tf.matrix_set_diag(outputs, tf.ones_like(outputs)[
:, :, 0] * (-2 ** 32 + 1))
except AttributeError as e:
except AttributeError:
outputs = tf.compat.v1.matrix_set_diag(outputs, tf.ones_like(outputs)[
:, :, 0] * (-2 ** 32 + 1))

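Here `tf.matrix_set_diag` is unavailable at the top level in TF 2.x, hence the `tf.compat.v1` fallback; the diagonal is set to `-2 ** 32 + 1`, a large negative value, so that each position effectively ignores itself after the attention softmax. The commit simply drops the unused exception variable.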
66 changes: 66 additions & 0 deletions docs/source/Examples.md
@@ -322,6 +322,72 @@ if __name__ == "__main__":
history = model.fit(model_input, data[target].values,
batch_size=256, epochs=10, verbose=2, validation_split=0.2, )
```
## Hash Layer with pre-defined key-value vocabulary

This example shows how to use a pre-defined key-value vocabulary in the `Hash` Layer. `movielens_age_vocabulary.csv` stores the key-value mapping for the `age` feature.

```python
from deepctr.models import DeepFM
from deepctr.feature_column import SparseFeat, VarLenSparseFeat, get_feature_names
import numpy as np
import pandas as pd
from tensorflow.python.keras.preprocessing.sequence import pad_sequences

try:
import tensorflow.compat.v1 as tf
except ImportError as e:
import tensorflow as tf

if __name__ == "__main__":
data = pd.read_csv("./movielens_sample.txt")
sparse_features = ["movie_id", "user_id",
"gender", "age", "occupation", "zip", ]

data[sparse_features] = data[sparse_features].astype(str)
target = ['rating']

# 1. Use hash encoding on the fly for sparse features, and process sequence features

genres_list = list(map(lambda x: x.split('|'), data['genres'].values))
genres_length = np.array(list(map(len, genres_list)))
max_len = max(genres_length)

# Notice : padding=`post`
genres_list = pad_sequences(genres_list, maxlen=max_len, padding='post', dtype=str, value=0)

# 2. Set the hashing space for each sparse field and generate feature config for the sequence feature

fixlen_feature_columns = [SparseFeat(feat, data[feat].nunique() * 5, embedding_dim=4, use_hash=True,
vocabulary_path='./movielens_age_vocabulary.csv' if feat == 'age' else None,
dtype='string')
for feat in sparse_features]
varlen_feature_columns = [
VarLenSparseFeat(SparseFeat('genres', vocabulary_size=100, embedding_dim=4,
use_hash=True, dtype="string"),
maxlen=max_len, combiner='mean',
)] # Notice : value 0 is for padding for sequence input feature
linear_feature_columns = fixlen_feature_columns + varlen_feature_columns
dnn_feature_columns = fixlen_feature_columns + varlen_feature_columns
feature_names = get_feature_names(linear_feature_columns + dnn_feature_columns)

# 3. Generate input data for the model
model_input = {name: data[name] for name in feature_names}
model_input['genres'] = genres_list

# 4. Define model, compile and train
model = DeepFM(linear_feature_columns, dnn_feature_columns, task='regression')
model.compile("adam", "mse", metrics=['mse'], )
if not hasattr(tf, 'version') or tf.version.VERSION < '2.0.0':
with tf.Session() as sess:
sess.run(tf.tables_initializer())
history = model.fit(model_input, data[target].values,
batch_size=256, epochs=10, verbose=2, validation_split=0.2, )
else:
history = model.fit(model_input, data[target].values,
batch_size=256, epochs=10, verbose=2, validation_split=0.2, )

```
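The vocabulary file wired in above maps each hash value to a raw key, one `value,key` pair per line; `0` is reserved for keys missing from the table, which is why the values start from `1`. `examples/movielens_age_vocabulary.csv`, added in this commit, reads:

```
1,1
2,18
3,25
4,35
5,45
6,50
7,56
```

Under TF 1.x the lookup table built from this file must be initialized explicitly (`sess.run(tf.tables_initializer())`) before training, which is why the snippet above branches on the TensorFlow version.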


## Estimator with TFRecord: Classification Criteo

4 changes: 2 additions & 2 deletions docs/source/Features.md
@@ -26,10 +26,10 @@ DNN based CTR prediction models usually have following 4 modules:
``SparseFeat`` is a namedtuple with signature ``SparseFeat(name, vocabulary_size, embedding_dim, use_hash, vocabulary_path, dtype, embeddings_initializer, embedding_name, group_name, trainable)``

- name : feature name
- vocabulary_size : number of unique feature values for sprase feature or hashing space when `use_hash=True`
- vocabulary_size : number of unique feature values for sparse feature or hashing space when `use_hash=True`
- embedding_dim : embedding dimension
- use_hash : default `False`.If `True` the input will be hashed to space of size `vocabulary_size`.
- vocabulary_path : default `None`. The `CSV` text file path of the vocabulary table used by `tf.lookup.TextFileInitializer`, which assigns one entry in the table for each line in the file. One entry contains two columns seperated by comma, the first is the value column, the second is the key column. The `0` value is reserved to use if a key is missing in the table, so hash value need start from `1`.
- vocabulary_path : default `None`. The `CSV` text file path of the vocabulary table used by `tf.lookup.TextFileInitializer`, which assigns one entry in the table for each line in the file. One entry contains two columns separated by comma, the first is the value column, the second is the key column. The `0` value is reserved to use if a key is missing in the table, so hash value need start from `1`.
- dtype : default `int32`.dtype of input tensor.
- embeddings_initializer : initializer for the `embeddings` matrix.
- embedding_name : default `None`. If None, the embedding_name will be same as `name`.
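As an illustration of the `vocabulary_path` lookup described above, here is a minimal sketch (not DeepCTR code) built directly on `tf.lookup.TextFileInitializer`, assuming a hypothetical file `./age_vocab.csv` with `value,key` rows such as `2,18`:

```python
import tensorflow as tf

# Keys are read from the second CSV column, values from the first.
initializer = tf.lookup.TextFileInitializer(
    filename='./age_vocab.csv',
    key_dtype=tf.string, key_index=1,
    value_dtype=tf.int64, value_index=0,
    delimiter=',')
# 0 is the reserved default for keys missing from the table.
table = tf.lookup.StaticHashTable(initializer, default_value=0)
print(table.lookup(tf.constant(['18', 'unseen'])))  # -> [2, 0]
```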
1 change: 1 addition & 0 deletions docs/source/History.md
@@ -1,4 +1,5 @@
# History
- 07/18/2021 : [v0.8.7](https://github.com/shenweichen/DeepCTR/releases/tag/v0.8.7) released. Support pre-defined key-value vocabulary in the `Hash` Layer. [example](./Examples.html#hash-layer-with-pre-defined-key-value-vocabulary)
- 06/14/2021 : [v0.8.6](https://github.com/shenweichen/DeepCTR/releases/tag/v0.8.6) released.Add [IFM](./Features.html#ifm-input-aware-factorization-machine) [DIFM](./Features.html#difm-dual-input-aware-factorization-machine), [FEFM and DeepFEFM](./Features.html#deepfefm-deep-field-embedded-factorization-machine) model.
- 03/13/2021 : [v0.8.5](https://github.com/shenweichen/DeepCTR/releases/tag/v0.8.5) released.Add [BST](./Features.html#bst-behavior-sequence-transformer) model.
- 02/12/2021 : [v0.8.4](https://github.com/shenweichen/DeepCTR/releases/tag/v0.8.4) released.Fix bug in DCN-Mix.
2 changes: 1 addition & 1 deletion docs/source/Quick-Start.md
@@ -86,7 +86,7 @@ fixlen_feature_columns = [SparseFeat(feat, vocabulary_size=data[feat].max() + 1,
```
- Feature Hashing on the fly
```python
fixlen_feature_columns = [SparseFeat(feat, vocabulary_size=1e6,embedding_dim=4, use_hash=True, dtype='string') # since the input is string
fixlen_feature_columns = [SparseFeat(feat, vocabulary_size=1e6,embedding_dim=4, use_hash=True, dtype='string') # the input is string
for feat in sparse_features] + [DenseFeat(feat, 1, )
for feat in dense_features]
```
2 changes: 1 addition & 1 deletion docs/source/conf.py
@@ -26,7 +26,7 @@
# The short X.Y version
version = ''
# The full version, including alpha/beta/rc tags
release = '0.8.6'
release = '0.8.7'


# -- General configuration ---------------------------------------------------
4 changes: 2 additions & 2 deletions docs/source/index.rst
@@ -42,12 +42,12 @@ You can read the latest code and related projects

News
-----
07/18/2021 : Support pre-defined key-value vocabulary in the `Hash` Layer. `example <./Examples.html#hash-layer-with-pre-defined-key-value-vocabulary>`_ `Changelog <https://github.com/shenweichen/DeepCTR/releases/tag/v0.8.7>`_

06/14/2021 : Add `IFM <./Features.html#ifm-input-aware-factorization-machine>`_ , `DIFM <./Features.html#difm-dual-input-aware-factorization-machine>`_ and `DeepFEFM <./Features.html#deepfefm-deep-field-embedded-factorization-machine>`_ . `Changelog <https://github.com/shenweichen/DeepCTR/releases/tag/v0.8.6>`_

03/13/2021 : Add `BST <./Features.html#bst-behavior-sequence-transformer>`_ . `Changelog <https://github.com/shenweichen/DeepCTR/releases/tag/v0.8.5>`_

02/12/2021 : Fix bug in DCN-Mix. `Changelog <https://github.com/shenweichen/DeepCTR/releases/tag/v0.8.4>`_

DisscussionGroup
-----------------------

7 changes: 7 additions & 0 deletions examples/movielens_age_vocabulary.csv
@@ -0,0 +1,7 @@
1,1
2,18
3,25
4,35
5,45
6,50
7,56
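Each row is a `value,key` pair: the first column is the hash value (starting from `1`, since `0` is reserved for missing keys), the second the raw `age`. A hedged sketch, not part of the commit, of how such a file can be generated with pandas:

```python
import pandas as pd

# Collect the unique raw ages and index them from 1;
# 0 stays reserved for out-of-vocabulary keys and masking.
ages = (pd.read_csv('./movielens_sample.txt')['age']
        .drop_duplicates().sort_values().reset_index(drop=True))
ages.index += 1
ages.to_csv('./movielens_age_vocabulary.csv', header=False)  # rows like "2,18"
```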
80 changes: 7 additions & 73 deletions examples/run_multivalue_movielens_vocab_hash.py
@@ -1,81 +1,16 @@
from deepctr.models import DeepFM
from deepctr.feature_column import SparseFeat, VarLenSparseFeat, get_feature_names
import functools
import os
import numpy as np
import pandas as pd
import shutil
from tensorflow.python.keras.preprocessing.sequence import pad_sequences

try:
import tensorflow.compat.v1 as tf
except ImportError as e:
import tensorflow as tf


def init_vocab(df, tmpdir):
"""initialize the vacabulary file of the sparse features
"""
vocab_size = {}

df_user_id = df.user_id.drop_duplicates().dropna().sort_values().reset_index().drop(columns='index')
df_user_id.index += 1
df_user_id.to_csv(f'{tmpdir}/user_id.csv', sep=',', index=True, header=False)
# must set to vocabulary size pluse 1, because 0 is used for miss of has and mask, same below
vocab_size['user_id'] = len(df_user_id) + 1

df_movie_id = df.movie_id.drop_duplicates().dropna().sort_values().reset_index().drop(
columns='index')
df_movie_id.index += 1
df_movie_id.to_csv(f'{tmpdir}/movie_id.csv', sep=',', index=True, header=False)
vocab_size['movie_id'] = len(df_movie_id) + 1

df_genre = pd.DataFrame({
'genre': list(set(functools.reduce(lambda x, y: x + y, df.genres.str.split('|'))))
}).genre.sort_values()
df_genre.index += 1
df_genre.to_csv(f'{tmpdir}/genre.csv', sep=',', index=True, header=False)
vocab_size['genre'] = len(df_genre) + 1

df_gender = df.gender.drop_duplicates().replace(
r'^\s*$', np.nan,
regex=True).dropna().sort_values().reset_index().drop(
columns='index')
df_gender.index += 1
df_gender.to_csv(f'{tmpdir}/gender.csv', sep=',', index=True, header=False)
vocab_size['gender'] = len(df_gender) + 1

df_age = df.age.drop_duplicates().dropna().sort_values().reset_index().drop(columns='index')
df_age.index += 1
df_age.to_csv(f'{tmpdir}/age.csv', sep=',', index=True, header=False)
vocab_size['age'] = len(df_age) + 1

df_occupation = df.occupation.drop_duplicates().replace(
r'^\s*$', np.nan,
regex=True).dropna().sort_values().reset_index().drop(
columns='index')
df_occupation.index += 1
df_occupation.to_csv(f'{tmpdir}/occupation.csv', sep=',', index=True, header=False)
vocab_size['occupation'] = len(df_occupation) + 1

df_zip = df.zip.drop_duplicates().replace(
r'^\s*$', np.nan,
regex=True).dropna().sort_values().reset_index().drop(columns='index')
df_zip.index += 1
df_zip.to_csv(f'{tmpdir}/zip.csv', sep=',', index=True, header=False)
vocab_size['zip'] = len(df_zip) + 1
return vocab_size


if __name__ == "__main__":
# change this to where the movielens dataset and work directory is
workdir = os.path.dirname(__file__)
data = pd.read_csv(f"{workdir}/movielens_sample.txt")

metadir = f'{workdir}/meta'
if not os.path.exists(metadir):
os.mkdir(metadir)
vocab_size = init_vocab(data, metadir)

data = pd.read_csv("./movielens_sample.txt")
sparse_features = ["movie_id", "user_id",
"gender", "age", "occupation", "zip", ]

@@ -93,10 +28,13 @@ def init_vocab(df, tmpdir):

# 2.set hashing space for each sparse field and generate feature config for sequence feature

fixlen_feature_columns = [SparseFeat(feat, vocab_size[feat], embedding_dim=4, use_hash=True, vocabulary_path=f'{metadir}/{feat}.csv', dtype='string')
fixlen_feature_columns = [SparseFeat(feat, data[feat].nunique() * 5, embedding_dim=4, use_hash=True,
vocabulary_path='./movielens_age_vocabulary.csv' if feat == 'age' else None,
dtype='string')
for feat in sparse_features]
varlen_feature_columns = [
VarLenSparseFeat(SparseFeat('genres', vocabulary_size=vocab_size['genre'], embedding_dim=4, use_hash=True, vocabulary_path=f'{metadir}/genre.csv', dtype="string"),
VarLenSparseFeat(SparseFeat('genres', vocabulary_size=100, embedding_dim=4,
use_hash=True, dtype="string"),
maxlen=max_len, combiner='mean',
)] # Notice : value 0 is for padding for sequence input feature
linear_feature_columns = fixlen_feature_columns + varlen_feature_columns
@@ -118,7 +56,3 @@ def init_vocab(df, tmpdir):
else:
history = model.fit(model_input, data[target].values,
batch_size=256, epochs=10, verbose=2, validation_split=0.2, )
if os.path.exists(metadir):
shutil.rmtree(metadir)

# %%
2 changes: 1 addition & 1 deletion setup.py
@@ -9,7 +9,7 @@

setuptools.setup(
name="deepctr",
version="0.8.6",
version="0.8.7",
author="Weichen Shen",
author_email="weichenswc@163.com",
description="Easy-to-use,Modular and Extendible package of deep learning based CTR(Click Through Rate) prediction models with tensorflow 1.x and 2.x .",
7 changes: 4 additions & 3 deletions tests/feature_test.py
@@ -4,7 +4,6 @@


def test_long_dense_vector():

feature_columns = [SparseFeat('user_id', 4, ), SparseFeat('item_id', 5, ), DenseFeat("pic_vec", 5)]
fixlen_feature_names = get_feature_names(feature_columns)

@@ -24,6 +23,8 @@ def test_long_dense_vector():
def test_feature_column_sparsefeat_vocabulary_path():
vocab_path = "./dummy_test.csv"
sf = SparseFeat('user_id', 4, vocabulary_path=vocab_path)
assert sf.vocabulary_path == vocab_path
if sf.vocabulary_path != vocab_path:
raise ValueError("sf.vocabulary_path is invalid")
vlsf = VarLenSparseFeat(sf, 6)
assert vlsf.vocabulary_path == vocab_path
if vlsf.vocabulary_path != vocab_path:
raise ValueError("vlsf.vocabulary_path is invalid")
13 changes: 8 additions & 5 deletions tests/layers/utils_test.py
@@ -3,25 +3,28 @@
import tensorflow as tf
from deepctr.layers.utils import Hash
from tests.utils import layer_test

try:
from tensorflow.python.keras.utils import CustomObjectScope
except:
except ImportError:
from tensorflow.keras.utils import CustomObjectScope


@pytest.mark.parametrize(
'num_buckets,mask_zero,vocabulary_path,input_data,expected_output',
[
(3+1, False, None, ['lakemerson'], None),
(3+1, True, None, ['lakemerson'], None),
(3+1, False, "./tests/layers/vocabulary_example.csv", [['lake'], ['johnson'], ['lakemerson']], [[1], [3], [0]])
(3 + 1, False, None, ['lakemerson'], None),
(3 + 1, True, None, ['lakemerson'], None),
(
3 + 1, False, "./tests/layers/vocabulary_example.csv", [['lake'], ['johnson'], ['lakemerson']], [[1], [3], [0]])
]
)
def test_Hash(num_buckets, mask_zero, vocabulary_path, input_data, expected_output):
if not hasattr(tf, 'version') or tf.version.VERSION < '2.0.0':
return

with CustomObjectScope({'Hash': Hash}):
layer_test(Hash, kwargs={'num_buckets': num_buckets, 'mask_zero': mask_zero, 'vocabulary_path': vocabulary_path},
layer_test(Hash,
kwargs={'num_buckets': num_buckets, 'mask_zero': mask_zero, 'vocabulary_path': vocabulary_path},
input_dtype=tf.string, input_data=np.array(input_data, dtype='str'),
expected_output_dtype=tf.int64, expected_output=expected_output)
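For reference, a minimal usage sketch of the `Hash` layer outside the test harness (assumes TF 2.x, running from the repository root, and the same vocabulary file the test uses):

```python
import tensorflow as tf
from deepctr.layers.utils import Hash

# 3 vocabulary entries plus the reserved 0 bucket for missing keys.
hash_layer = Hash(num_buckets=3 + 1, mask_zero=False,
                  vocabulary_path="./tests/layers/vocabulary_example.csv")
out = hash_layer(tf.constant([['lake'], ['johnson'], ['lakemerson']]))
print(out)  # per the parametrized case above: [[1], [3], [0]]
```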
