Commit: update examples
shenweichen committed Jul 18, 2021
1 parent 825bba1 commit 95ad62e
Showing 14 changed files with 103 additions and 91 deletions.
2 changes: 1 addition & 1 deletion README.md
@@ -101,7 +101,7 @@ If you find this code useful in your research, please cite it using the followin
<td>
<a href="https://github.com/zanshuxun"><img width="70" height="70" src="https://github.com/zanshuxun.png?s=40" alt="pic"></a><br>
<a href="https://github.com/zanshuxun">Zan Shuxun</a> ​
<p>Beijing University <br> of Posts and <br> Telecommunications </p>​
<p>Alibaba Group </p>​
</td>
<td>
​ <a href="https://github.com/pandeconscious"><img width="70" height="70" src="https://github.com/pandeconscious.png?s=40" alt="pic"></a><br>
2 changes: 1 addition & 1 deletion deepctr/__init__.py
@@ -1,4 +1,4 @@
from .utils import check_version

__version__ = '0.8.6'
__version__ = '0.8.7'
check_version(__version__)
2 changes: 1 addition & 1 deletion deepctr/layers/sequence.py
@@ -561,7 +561,7 @@ def call(self, inputs, mask=None, training=None, **kwargs):
try:
outputs = tf.matrix_set_diag(outputs, tf.ones_like(outputs)[
:, :, 0] * (-2 ** 32 + 1))
except AttributeError as e:
except AttributeError:
outputs = tf.compat.v1.matrix_set_diag(outputs, tf.ones_like(outputs)[
:, :, 0] * (-2 ** 32 + 1))

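Here `tf.matrix_set_diag` is unavailable at the top level in TF 2.x, hence the `tf.compat.v1` fallback; the diagonal is set to `-2 ** 32 + 1`, a large negative value, so that each position effectively ignores itself after the attention softmax. The commit simply drops the unused exception variable.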
66 changes: 66 additions & 0 deletions docs/source/Examples.md
@@ -322,6 +322,72 @@ if __name__ == "__main__":
history = model.fit(model_input, data[target].values,
batch_size=256, epochs=10, verbose=2, validation_split=0.2, )
```
## Hash Layer with pre-defined key-value vocabulary

This example shows how to use a pre-defined key-value vocabulary in the `Hash` Layer. `movielens_age_vocabulary.csv` stores the key-value mapping for the `age` feature.

```python
from deepctr.models import DeepFM
from deepctr.feature_column import SparseFeat, VarLenSparseFeat, get_feature_names
import numpy as np
import pandas as pd
from tensorflow.python.keras.preprocessing.sequence import pad_sequences

try:
import tensorflow.compat.v1 as tf
except ImportError as e:
import tensorflow as tf

if __name__ == "__main__":
data = pd.read_csv("./movielens_sample.txt")
sparse_features = ["movie_id", "user_id",
"gender", "age", "occupation", "zip", ]

data[sparse_features] = data[sparse_features].astype(str)
target = ['rating']

# 1. Use hash encoding on the fly for sparse features, and process sequence features

genres_list = list(map(lambda x: x.split('|'), data['genres'].values))
genres_length = np.array(list(map(len, genres_list)))
max_len = max(genres_length)

# Notice : padding=`post`
genres_list = pad_sequences(genres_list, maxlen=max_len, padding='post', dtype=str, value=0)

# 2. Set the hashing space for each sparse field and generate feature config for the sequence feature

fixlen_feature_columns = [SparseFeat(feat, data[feat].nunique() * 5, embedding_dim=4, use_hash=True,
vocabulary_path='./movielens_age_vocabulary.csv' if feat == 'age' else None,
dtype='string')
for feat in sparse_features]
varlen_feature_columns = [
VarLenSparseFeat(SparseFeat('genres', vocabulary_size=100, embedding_dim=4,
use_hash=True, dtype="string"),
maxlen=max_len, combiner='mean',
)] # Notice : value 0 is for padding for sequence input feature
linear_feature_columns = fixlen_feature_columns + varlen_feature_columns
dnn_feature_columns = fixlen_feature_columns + varlen_feature_columns
feature_names = get_feature_names(linear_feature_columns + dnn_feature_columns)

# 3. Generate input data for the model
model_input = {name: data[name] for name in feature_names}
model_input['genres'] = genres_list

# 4. Define model, compile and train
model = DeepFM(linear_feature_columns, dnn_feature_columns, task='regression')
model.compile("adam", "mse", metrics=['mse'], )
if not hasattr(tf, 'version') or tf.version.VERSION < '2.0.0':
with tf.Session() as sess:
sess.run(tf.tables_initializer())
history = model.fit(model_input, data[target].values,
batch_size=256, epochs=10, verbose=2, validation_split=0.2, )
else:
history = model.fit(model_input, data[target].values,
batch_size=256, epochs=10, verbose=2, validation_split=0.2, )

```
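The vocabulary file wired in above maps each hash value to a raw key, one `value,key` pair per line; `0` is reserved for keys missing from the table, which is why the values start from `1`. `examples/movielens_age_vocabulary.csv`, added in this commit, reads:

```
1,1
2,18
3,25
4,35
5,45
6,50
7,56
```

Under TF 1.x the lookup table built from this file must be initialized explicitly (`sess.run(tf.tables_initializer())`) before training, which is why the snippet above branches on the TensorFlow version.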


## Estimator with TFRecord: Classification Criteo

4 changes: 2 additions & 2 deletions docs/source/Features.md
@@ -26,10 +26,10 @@ DNN based CTR prediction models usually have following 4 modules:
``SparseFeat`` is a namedtuple with signature ``SparseFeat(name, vocabulary_size, embedding_dim, use_hash, vocabulary_path, dtype, embeddings_initializer, embedding_name, group_name, trainable)``

- name : feature name
- vocabulary_size : number of unique feature values for sprase feature or hashing space when `use_hash=True`
- vocabulary_size : number of unique feature values for sparse feature or hashing space when `use_hash=True`
- embedding_dim : embedding dimension
- use_hash : default `False`.If `True` the input will be hashed to space of size `vocabulary_size`.
- vocabulary_path : default `None`. The `CSV` text file path of the vocabulary table used by `tf.lookup.TextFileInitializer`, which assigns one entry in the table for each line in the file. One entry contains two columns seperated by comma, the first is the value column, the second is the key column. The `0` value is reserved to use if a key is missing in the table, so hash value need start from `1`.
- vocabulary_path : default `None`. The `CSV` text file path of the vocabulary table used by `tf.lookup.TextFileInitializer`, which assigns one entry in the table for each line in the file. One entry contains two columns separated by comma, the first is the value column, the second is the key column. The `0` value is reserved to use if a key is missing in the table, so hash value need start from `1`.
- dtype : default `int32`.dtype of input tensor.
- embeddings_initializer : initializer for the `embeddings` matrix.
- embedding_name : default `None`. If None, the embedding_name will be same as `name`.
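As an illustration of the `vocabulary_path` lookup described above, here is a minimal sketch (not DeepCTR code) built directly on `tf.lookup.TextFileInitializer`, assuming a hypothetical file `./age_vocab.csv` with `value,key` rows such as `2,18`:

```python
import tensorflow as tf

# Keys are read from the second CSV column, values from the first.
initializer = tf.lookup.TextFileInitializer(
    filename='./age_vocab.csv',
    key_dtype=tf.string, key_index=1,
    value_dtype=tf.int64, value_index=0,
    delimiter=',')
# 0 is the reserved default for keys missing from the table.
table = tf.lookup.StaticHashTable(initializer, default_value=0)
print(table.lookup(tf.constant(['18', 'unseen'])))  # -> [2, 0]
```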
1 change: 1 addition & 0 deletions docs/source/History.md
@@ -1,4 +1,5 @@
# History
- 07/18/2021 : [v0.8.7](https://github.com/shenweichen/DeepCTR/releases/tag/v0.8.7) released. Support pre-defined key-value vocabulary in the `Hash` Layer. [example](./Examples.html#hash-layer-with-pre-defined-key-value-vocabulary)
- 06/14/2021 : [v0.8.6](https://github.com/shenweichen/DeepCTR/releases/tag/v0.8.6) released.Add [IFM](./Features.html#ifm-input-aware-factorization-machine) [DIFM](./Features.html#difm-dual-input-aware-factorization-machine), [FEFM and DeepFEFM](./Features.html#deepfefm-deep-field-embedded-factorization-machine) model.
- 03/13/2021 : [v0.8.5](https://github.com/shenweichen/DeepCTR/releases/tag/v0.8.5) released.Add [BST](./Features.html#bst-behavior-sequence-transformer) model.
- 02/12/2021 : [v0.8.4](https://github.com/shenweichen/DeepCTR/releases/tag/v0.8.4) released.Fix bug in DCN-Mix.
2 changes: 1 addition & 1 deletion docs/source/Quick-Start.md
@@ -86,7 +86,7 @@ fixlen_feature_columns = [SparseFeat(feat, vocabulary_size=data[feat].max() + 1,
```
- Feature Hashing on the fly
```python
fixlen_feature_columns = [SparseFeat(feat, vocabulary_size=1e6,embedding_dim=4, use_hash=True, dtype='string') # since the input is string
fixlen_feature_columns = [SparseFeat(feat, vocabulary_size=1e6,embedding_dim=4, use_hash=True, dtype='string') # the input is string
for feat in sparse_features] + [DenseFeat(feat, 1, )
for feat in dense_features]
```
2 changes: 1 addition & 1 deletion docs/source/conf.py
@@ -26,7 +26,7 @@
# The short X.Y version
version = ''
# The full version, including alpha/beta/rc tags
release = '0.8.6'
release = '0.8.7'


# -- General configuration ---------------------------------------------------
4 changes: 2 additions & 2 deletions docs/source/index.rst
@@ -42,12 +42,12 @@ You can read the latest code and related projects

News
-----
07/18/2021 : Support pre-defined key-value vocabulary in the `Hash` Layer. `example <./Examples.html#hash-layer-with-pre-defined-key-value-vocabulary>`_ `Changelog <https://github.com/shenweichen/DeepCTR/releases/tag/v0.8.7>`_

06/14/2021 : Add `IFM <./Features.html#ifm-input-aware-factorization-machine>`_ , `DIFM <./Features.html#difm-dual-input-aware-factorization-machine>`_ and `DeepFEFM <./Features.html#deepfefm-deep-field-embedded-factorization-machine>`_ . `Changelog <https://github.com/shenweichen/DeepCTR/releases/tag/v0.8.6>`_

03/13/2021 : Add `BST <./Features.html#bst-behavior-sequence-transformer>`_ . `Changelog <https://github.com/shenweichen/DeepCTR/releases/tag/v0.8.5>`_

02/12/2021 : Fix bug in DCN-Mix. `Changelog <https://github.com/shenweichen/DeepCTR/releases/tag/v0.8.4>`_

DisscussionGroup
-----------------------

7 changes: 7 additions & 0 deletions examples/movielens_age_vocabulary.csv
@@ -0,0 +1,7 @@
1,1
2,18
3,25
4,35
5,45
6,50
7,56
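Each row is a `value,key` pair: the first column is the hash value (starting from `1`, since `0` is reserved for missing keys), the second the raw `age`. A hedged sketch, not part of the commit, of how such a file can be generated with pandas:

```python
import pandas as pd

# Collect the unique raw ages and index them from 1;
# 0 stays reserved for out-of-vocabulary keys and masking.
ages = (pd.read_csv('./movielens_sample.txt')['age']
        .drop_duplicates().sort_values().reset_index(drop=True))
ages.index += 1
ages.to_csv('./movielens_age_vocabulary.csv', header=False)  # rows like "2,18"
```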
80 changes: 7 additions & 73 deletions examples/run_multivalue_movielens_vocab_hash.py
@@ -1,81 +1,16 @@
from deepctr.models import DeepFM
from deepctr.feature_column import SparseFeat, VarLenSparseFeat, get_feature_names
import functools
import os
import numpy as np
import pandas as pd
import shutil
from tensorflow.python.keras.preprocessing.sequence import pad_sequences

try:
import tensorflow.compat.v1 as tf
except ImportError as e:
import tensorflow as tf


def init_vocab(df, tmpdir):
"""initialize the vacabulary file of the sparse features
"""
vocab_size = {}

df_user_id = df.user_id.drop_duplicates().dropna().sort_values().reset_index().drop(columns='index')
df_user_id.index += 1
df_user_id.to_csv(f'{tmpdir}/user_id.csv', sep=',', index=True, header=False)
# must set to vocabulary size pluse 1, because 0 is used for miss of has and mask, same below
vocab_size['user_id'] = len(df_user_id) + 1

df_movie_id = df.movie_id.drop_duplicates().dropna().sort_values().reset_index().drop(
columns='index')
df_movie_id.index += 1
df_movie_id.to_csv(f'{tmpdir}/movie_id.csv', sep=',', index=True, header=False)
vocab_size['movie_id'] = len(df_movie_id) + 1

df_genre = pd.DataFrame({
'genre': list(set(functools.reduce(lambda x, y: x + y, df.genres.str.split('|'))))
}).genre.sort_values()
df_genre.index += 1
df_genre.to_csv(f'{tmpdir}/genre.csv', sep=',', index=True, header=False)
vocab_size['genre'] = len(df_genre) + 1

df_gender = df.gender.drop_duplicates().replace(
r'^\s*$', np.nan,
regex=True).dropna().sort_values().reset_index().drop(
columns='index')
df_gender.index += 1
df_gender.to_csv(f'{tmpdir}/gender.csv', sep=',', index=True, header=False)
vocab_size['gender'] = len(df_gender) + 1

df_age = df.age.drop_duplicates().dropna().sort_values().reset_index().drop(columns='index')
df_age.index += 1
df_age.to_csv(f'{tmpdir}/age.csv', sep=',', index=True, header=False)
vocab_size['age'] = len(df_age) + 1

df_occupation = df.occupation.drop_duplicates().replace(
r'^\s*$', np.nan,
regex=True).dropna().sort_values().reset_index().drop(
columns='index')
df_occupation.index += 1
df_occupation.to_csv(f'{tmpdir}/occupation.csv', sep=',', index=True, header=False)
vocab_size['occupation'] = len(df_occupation) + 1

df_zip = df.zip.drop_duplicates().replace(
r'^\s*$', np.nan,
regex=True).dropna().sort_values().reset_index().drop(columns='index')
df_zip.index += 1
df_zip.to_csv(f'{tmpdir}/zip.csv', sep=',', index=True, header=False)
vocab_size['zip'] = len(df_zip) + 1
return vocab_size


if __name__ == "__main__":
# change this to where the movielens dataset and work directory is
workdir = os.path.dirname(__file__)
data = pd.read_csv(f"{workdir}/movielens_sample.txt")

metadir = f'{workdir}/meta'
if not os.path.exists(metadir):
os.mkdir(metadir)
vocab_size = init_vocab(data, metadir)

data = pd.read_csv("./movielens_sample.txt")
sparse_features = ["movie_id", "user_id",
"gender", "age", "occupation", "zip", ]

@@ -93,10 +28,13 @@ def init_vocab(df, tmpdir):

# 2.set hashing space for each sparse field and generate feature config for sequence feature

fixlen_feature_columns = [SparseFeat(feat, vocab_size[feat], embedding_dim=4, use_hash=True, vocabulary_path=f'{metadir}/{feat}.csv', dtype='string')
fixlen_feature_columns = [SparseFeat(feat, data[feat].nunique() * 5, embedding_dim=4, use_hash=True,
vocabulary_path='./movielens_age_vocabulary.csv' if feat == 'age' else None,
dtype='string')
for feat in sparse_features]
varlen_feature_columns = [
VarLenSparseFeat(SparseFeat('genres', vocabulary_size=vocab_size['genre'], embedding_dim=4, use_hash=True, vocabulary_path=f'{metadir}/genre.csv', dtype="string"),
VarLenSparseFeat(SparseFeat('genres', vocabulary_size=100, embedding_dim=4,
use_hash=True, dtype="string"),
maxlen=max_len, combiner='mean',
)] # Notice : value 0 is for padding for sequence input feature
linear_feature_columns = fixlen_feature_columns + varlen_feature_columns
@@ -118,7 +56,3 @@ def init_vocab(df, tmpdir):
else:
history = model.fit(model_input, data[target].values,
batch_size=256, epochs=10, verbose=2, validation_split=0.2, )
if os.path.exists(metadir):
shutil.rmtree(metadir)

# %%
2 changes: 1 addition & 1 deletion setup.py
@@ -9,7 +9,7 @@

setuptools.setup(
name="deepctr",
version="0.8.6",
version="0.8.7",
author="Weichen Shen",
author_email="weichenswc@163.com",
description="Easy-to-use,Modular and Extendible package of deep learning based CTR(Click Through Rate) prediction models with tensorflow 1.x and 2.x .",
7 changes: 4 additions & 3 deletions tests/feature_test.py
@@ -4,7 +4,6 @@


def test_long_dense_vector():

feature_columns = [SparseFeat('user_id', 4, ), SparseFeat('item_id', 5, ), DenseFeat("pic_vec", 5)]
fixlen_feature_names = get_feature_names(feature_columns)

@@ -24,6 +23,8 @@ def test_long_dense_vector():
def test_feature_column_sparsefeat_vocabulary_path():
vocab_path = "./dummy_test.csv"
sf = SparseFeat('user_id', 4, vocabulary_path=vocab_path)
assert sf.vocabulary_path == vocab_path
if sf.vocabulary_path != vocab_path:
raise ValueError("sf.vocabulary_path is invalid")
vlsf = VarLenSparseFeat(sf, 6)
assert vlsf.vocabulary_path == vocab_path
if vlsf.vocabulary_path != vocab_path:
raise ValueError("vlsf.vocabulary_path is invalid")
13 changes: 8 additions & 5 deletions tests/layers/utils_test.py
@@ -3,25 +3,28 @@
import tensorflow as tf
from deepctr.layers.utils import Hash
from tests.utils import layer_test

try:
from tensorflow.python.keras.utils import CustomObjectScope
except:
except ImportError:
from tensorflow.keras.utils import CustomObjectScope


@pytest.mark.parametrize(
'num_buckets,mask_zero,vocabulary_path,input_data,expected_output',
[
(3+1, False, None, ['lakemerson'], None),
(3+1, True, None, ['lakemerson'], None),
(3+1, False, "./tests/layers/vocabulary_example.csv", [['lake'], ['johnson'], ['lakemerson']], [[1], [3], [0]])
(3 + 1, False, None, ['lakemerson'], None),
(3 + 1, True, None, ['lakemerson'], None),
(
3 + 1, False, "./tests/layers/vocabulary_example.csv", [['lake'], ['johnson'], ['lakemerson']], [[1], [3], [0]])
]
)
def test_Hash(num_buckets, mask_zero, vocabulary_path, input_data, expected_output):
if not hasattr(tf, 'version') or tf.version.VERSION < '2.0.0':
return

with CustomObjectScope({'Hash': Hash}):
layer_test(Hash, kwargs={'num_buckets': num_buckets, 'mask_zero': mask_zero, 'vocabulary_path': vocabulary_path},
layer_test(Hash,
kwargs={'num_buckets': num_buckets, 'mask_zero': mask_zero, 'vocabulary_path': vocabulary_path},
input_dtype=tf.string, input_data=np.array(input_data, dtype='str'),
expected_output_dtype=tf.int64, expected_output=expected_output)
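For reference, a minimal usage sketch of the `Hash` layer outside the test harness (assumes TF 2.x, running from the repository root, and the same vocabulary file the test uses):

```python
import tensorflow as tf
from deepctr.layers.utils import Hash

# 3 vocabulary entries plus the reserved 0 bucket for missing keys.
hash_layer = Hash(num_buckets=3 + 1, mask_zero=False,
                  vocabulary_path="./tests/layers/vocabulary_example.csv")
out = hash_layer(tf.constant([['lake'], ['johnson'], ['lakemerson']]))
print(out)  # per the parametrized case above: [[1], [3], [0]]
```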
