v0.7.1
* Simplify `VarLenSparseFeat`, support setting weight_normalization.

* Fix the embedding size of `SparseFeat` in `linear_feature_columns`.
浅梦 committed Jan 28, 2020
1 parent db229dc commit dcf583f
Showing 22 changed files with 129 additions and 109 deletions.
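
For orientation before the per-file diffs: the user-facing change is that `VarLenSparseFeat` no longer repeats the vocabulary and embedding arguments but wraps a `SparseFeat`, and it gains a `weight_norm` flag. A rough before/after sketch based on the diffs below; the feature name and sizes are illustrative.

```python
from deepctr.inputs import SparseFeat, VarLenSparseFeat

# Up to 0.7.0, a sequence feature duplicated the sparse-feature arguments:
# VarLenSparseFeat('genres', maxlen=5, vocabulary_size=101, embedding_dim=4, combiner='mean')

# From 0.7.1, wrap a SparseFeat instead; weight_norm controls normalization of the
# optional per-position weights (see WeightedSequenceLayer below).
genres = VarLenSparseFeat(SparseFeat('genres', vocabulary_size=101, embedding_dim=4),
                          maxlen=5, combiner='mean', weight_norm=True)
```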
2 changes: 1 addition & 1 deletion .github/ISSUE_TEMPLATE/bug_report.md
@@ -20,7 +20,7 @@ Steps to reproduce the behavior:
**Operating environment(运行环境):**
- python version [e.g. 3.4, 3.6]
- tensorflow version [e.g. 1.4.0, 1.12.0]
- deepctr version [e.g. 0.5.2,]
- deepctr version [e.g. 0.7.1,]

**Additional context**
Add any other context about the problem here.
2 changes: 1 addition & 1 deletion .github/ISSUE_TEMPLATE/question.md
@@ -17,4 +17,4 @@ Add any other context about the problem here.
**Operating environment(运行环境):**
- python version [e.g. 3.6]
- tensorflow version [e.g. 1.4.0,]
- deepctr version [e.g. 0.5.2,]
- deepctr version [e.g. 0.7.1,]
2 changes: 1 addition & 1 deletion deepctr/__init__.py
@@ -1,4 +1,4 @@
from .utils import check_version

__version__ = '0.7.0'
__version__ = '0.7.1'
check_version(__version__)
109 changes: 62 additions & 47 deletions deepctr/inputs.py
@@ -14,67 +14,74 @@
from tensorflow.python.keras.regularizers import l2

from .layers.sequence import SequencePoolingLayer, WeightedSequenceLayer
from .layers.utils import Hash, concat_func, Linear,add_func
from .layers.utils import Hash, concat_func, Linear, add_func

DEFAULT_GROUP_NAME = "default_group"


class SparseFeat(namedtuple('SparseFeat',
['name', 'vocabulary_size', 'embedding_dim', 'use_hash', 'dtype', 'embedding_name', 'group_name'])):
['name', 'vocabulary_size', 'embedding_dim', 'use_hash', 'dtype', 'embedding_name',
'group_name'])):
__slots__ = ()

def __new__(cls, name, vocabulary_size, embedding_dim=4, use_hash=False, dtype="int32", embedding_name=None,
group_name=DEFAULT_GROUP_NAME):
if embedding_name is None:
embedding_name = name
if embedding_dim == "auto":
embedding_dim = 6 * int(pow(vocabulary_size, 0.25))
embedding_dim = 6 * int(pow(vocabulary_size, 0.25))
return super(SparseFeat, cls).__new__(cls, name, vocabulary_size, embedding_dim, use_hash, dtype,
embedding_name, group_name)

def __hash__(self):
return self.name.__hash__()

# def __eq__(self, other):
# if self.name == other.name and self.embedding_name == other.embedding_name:
# return True
# return False

# def __repr__(self):
# return 'SparseFeat:'+self.name
class VarLenSparseFeat(namedtuple('VarLenSparseFeat',
['sparsefeat', 'maxlen', 'combiner', 'length_name', 'weight_name', 'weight_norm'])):
__slots__ = ()

def __new__(cls, sparsefeat, maxlen, combiner="mean", length_name=None, weight_name=None, weight_norm=True):
return super(VarLenSparseFeat, cls).__new__(cls, sparsefeat, maxlen, combiner, length_name, weight_name,
weight_norm)

class DenseFeat(namedtuple('DenseFeat', ['name', 'dimension', 'dtype'])):
__slots__ = ()
@property
def name(self):
return self.sparsefeat.name

def __new__(cls, name, dimension=1, dtype="float32"):
return super(DenseFeat, cls).__new__(cls, name, dimension, dtype)
@property
def vocabulary_size(self):
return self.sparsefeat.vocabulary_size

def __hash__(self):
return self.name.__hash__()
@property
def embedding_dim(self):
return self.sparsefeat.embedding_dim

# def __eq__(self, other):
# if self.name == other.name:
# return True
# return False
@property
def use_hash(self):
return self.sparsefeat.use_hash

# def __repr__(self):
# return 'DenseFeat:'+self.name
@property
def dtype(self):
return self.sparsefeat.dtype

@property
def embedding_name(self):
return self.sparsefeat.embedding_name

class VarLenSparseFeat(namedtuple('VarLenFeat',
['name', 'maxlen', 'vocabulary_size', 'embedding_dim', 'combiner', 'use_hash',
'dtype','length_name' ,'weight_name', 'embedding_name', 'group_name'])):
@property
def group_name(self):
return self.sparsefeat.group_name

def __hash__(self):
return self.name.__hash__()


class DenseFeat(namedtuple('DenseFeat', ['name', 'dimension', 'dtype'])):
__slots__ = ()

def __new__(cls, name, maxlen, vocabulary_size, embedding_dim=4, combiner="mean", use_hash=False, dtype="float32",
length_name=None, weight_name=None, embedding_name=None, group_name=DEFAULT_GROUP_NAME):
if embedding_name is None:
embedding_name = name
if embedding_dim == "auto":
embedding_dim = 6 * int(pow(vocabulary_size, 0.25))
return super(VarLenSparseFeat, cls).__new__(cls, name, maxlen, vocabulary_size, embedding_dim, combiner,
use_hash, dtype, length_name,weight_name, embedding_name, group_name)
def __new__(cls, name, dimension=1, dtype="float32"):
return super(DenseFeat, cls).__new__(cls, name, dimension, dtype)

def __hash__(self):
return self.name.__hash__()
@@ -85,7 +92,7 @@ def __hash__(self):
# return False

# def __repr__(self):
# return 'VarLenSparseFeat:'+self.name
# return 'DenseFeat:'+self.name


def get_feature_names(feature_columns):
@@ -111,9 +118,9 @@ def build_input_features(feature_columns, prefix=''):
dtype=fc.dtype)
if fc.weight_name is not None:
input_features[fc.weight_name] = Input(shape=(fc.maxlen, 1), name=prefix + fc.weight_name,
dtype="float32")
dtype="float32")
if fc.length_name is not None:
input_features[fc.length_name] = Input((1,),name=prefix+fc.length_name,dtype='int32')
input_features[fc.length_name] = Input((1,), name=prefix + fc.length_name, dtype='int32')

else:
raise TypeError("Invalid feature column type,got", type(fc))
@@ -123,12 +130,12 @@ def build_input_features(feature_columns, prefix=''):

def create_embedding_dict(sparse_feature_columns, varlen_sparse_feature_columns, init_std, seed, l2_reg,
prefix='sparse_', seq_mask_zero=True):
sparse_embedding = {feat.embedding_name:Embedding(feat.vocabulary_size, feat.embedding_dim,
embeddings_initializer=RandomNormal(
mean=0.0, stddev=init_std, seed=seed),
embeddings_regularizer=l2(l2_reg),
name=prefix + '_emb_' + feat.embedding_name) for feat in sparse_feature_columns}

sparse_embedding = {feat.embedding_name: Embedding(feat.vocabulary_size, feat.embedding_dim,
embeddings_initializer=RandomNormal(
mean=0.0, stddev=init_std, seed=seed),
embeddings_regularizer=l2(l2_reg),
name=prefix + '_emb_' + feat.embedding_name) for feat in
sparse_feature_columns}

if varlen_sparse_feature_columns and len(varlen_sparse_feature_columns) > 0:
for feat in varlen_sparse_feature_columns:
@@ -160,7 +167,7 @@ def get_embedding_vec_list(embedding_dict, input_dict, sparse_feature_columns, r

def create_embedding_matrix(feature_columns, l2_reg, init_std, seed, prefix="", seq_mask_zero=True):
sparse_feature_columns = list(
filter(lambda x: isinstance(x, SparseFeat) , feature_columns)) if feature_columns else []
filter(lambda x: isinstance(x, SparseFeat), feature_columns)) if feature_columns else []
varlen_sparse_feature_columns = list(
filter(lambda x: isinstance(x, VarLenSparseFeat), feature_columns)) if feature_columns else []
sparse_emb_dict = create_embedding_dict(sparse_feature_columns, varlen_sparse_feature_columns, init_std, seed,
@@ -170,25 +177,32 @@ def create_embedding_matrix(feature_columns, l2_reg, init_std, seed, prefix="",

def get_linear_logit(features, feature_columns, units=1, use_bias=False, init_std=0.0001, seed=1024, prefix='linear',
l2_reg=0):
for i in range(len(feature_columns)):
if isinstance(feature_columns[i], SparseFeat):
feature_columns[i] = feature_columns[i]._replace(embedding_dim=1)
if isinstance(feature_columns[i], VarLenSparseFeat):
feature_columns[i] = feature_columns[i]._replace(
sparsefeat=feature_columns[i].sparsefeat._replace(embedding_dim=1))

linear_emb_list = [input_from_feature_columns(features, feature_columns, l2_reg, init_std, seed,
prefix=prefix + str(i))[0] for i in range(units)]
_, dense_input_list = input_from_feature_columns(features, feature_columns, l2_reg, init_std, seed, prefix=prefix)

linear_logit_list = []
for i in range(units):

if len(linear_emb_list[0]) > 0 and len(dense_input_list) > 0:
if len(linear_emb_list[i]) > 0 and len(dense_input_list) > 0:
sparse_input = concat_func(linear_emb_list[i])
dense_input = concat_func(dense_input_list)
linear_logit = Linear(l2_reg, mode=2, use_bias=use_bias)([sparse_input, dense_input])
elif len(linear_emb_list[0]) > 0:
elif len(linear_emb_list[i]) > 0:
sparse_input = concat_func(linear_emb_list[i])
linear_logit = Linear(l2_reg, mode=0, use_bias=use_bias)(sparse_input)
elif len(dense_input_list) > 0:
dense_input = concat_func(dense_input_list)
linear_logit = Linear(l2_reg, mode=1, use_bias=use_bias)(dense_input)
else:
#raise NotImplementedError
# raise NotImplementedError
return add_func([])
linear_logit_list.append(linear_logit)

@@ -235,15 +249,15 @@ def get_varlen_pooling_list(embedding_dict, features, varlen_sparse_feature_colu
feature_length_name = fc.length_name
if feature_length_name is not None:
if fc.weight_name is not None:
seq_input = WeightedSequenceLayer()(
seq_input = WeightedSequenceLayer(weight_normalization=fc.weight_norm)(
[embedding_dict[feature_name], features[feature_length_name], features[fc.weight_name]])
else:
seq_input = embedding_dict[feature_name]
vec = SequencePoolingLayer(combiner, supports_masking=False)(
[seq_input, features[feature_length_name]])
else:
if fc.weight_name is not None:
seq_input = WeightedSequenceLayer(supports_masking=True)(
seq_input = WeightedSequenceLayer(weight_normalization=fc.weight_norm, supports_masking=True)(
[embedding_dict[feature_name], features[fc.weight_name]])
else:
seq_input = embedding_dict[feature_name]
@@ -254,6 +268,7 @@ def get_varlen_pooling_list(embedding_dict, features, varlen_sparse_feature_colu
return chain.from_iterable(pooling_vec_list.values())
return pooling_vec_list


def get_dense_input(features, feature_columns):
dense_feature_columns = list(filter(lambda x: isinstance(x, DenseFeat), feature_columns)) if feature_columns else []
dense_input_list = []
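
The `get_linear_logit` change above is what fixes the embedding size of `SparseFeat` in `linear_feature_columns`: linear terms need 1-dimensional embeddings, and since feature columns are namedtuples, the code swaps the field with `_replace` (for `VarLenSparseFeat`, on the nested `SparseFeat`). A standalone sketch of that mechanism; the names and sizes are made up.

```python
from deepctr.inputs import SparseFeat, VarLenSparseFeat

fc = SparseFeat('user_id', vocabulary_size=1000, embedding_dim=8)
linear_fc = fc._replace(embedding_dim=1)        # namedtuple copy with a 1-d embedding
assert fc.embedding_dim == 8 and linear_fc.embedding_dim == 1

seq_fc = VarLenSparseFeat(SparseFeat('genres', vocabulary_size=100, embedding_dim=8), maxlen=5)
# embedding_dim now lives on the wrapped SparseFeat, so replace it there:
linear_seq_fc = seq_fc._replace(sparsefeat=seq_fc.sparsefeat._replace(embedding_dim=1))
assert linear_seq_fc.embedding_dim == 1         # read back through the delegating property
```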
2 changes: 1 addition & 1 deletion deepctr/layers/core.py
@@ -66,7 +66,7 @@ def build(self, input_shape):
if input_shape[0][-1] != input_shape[1][-1] or input_shape[0][1] != 1:
raise ValueError('A `LocalActivationUnit` layer requires '
'inputs of a two inputs with shape (None,1,embedding_size) and (None,T,embedding_size)'
'Got different shapes: %s,%s' % (input_shape))
'Got different shapes: %s,%s' % (input_shape[0],input_shape[1]))
size = 4 * \
int(input_shape[0][-1]
) if len(self.hidden_units) == 0 else self.hidden_units[-1]
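
For context on the `LocalActivationUnit` change: `%`-formatting only unpacks a tuple, so `'%s,%s' % (input_shape)` with `input_shape` being a list of two shapes raises `TypeError: not enough arguments for format string` instead of printing the shapes; passing the two elements explicitly restores the intended message. A minimal illustration with made-up shapes.

```python
shapes = [(None, 1, 8), (None, 4, 8)]  # stand-ins for the two Keras input shapes

# "%s,%s" % (shapes)   # would raise TypeError: not enough arguments for format string
print('Got different shapes: %s,%s' % (shapes[0], shapes[1]))
# -> Got different shapes: (None, 1, 8),(None, 4, 8)
```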
4 changes: 2 additions & 2 deletions deepctr/layers/sequence.py
@@ -121,12 +121,12 @@ class WeightedSequenceLayer(Layer):
- 3D tensor with shape: ``(batch_size, T, embedding_size)``.
Arguments
- **weight_normalization**: bool.Whether normalize the weight socre before applying to sequence.
- **weight_normalization**: bool.Whether normalize the weight score before applying to sequence.
- **supports_masking**:If True,the input need to support masking.
"""

def __init__(self,weight_normalization=False, supports_masking=False, **kwargs):
def __init__(self,weight_normalization=True, supports_masking=False, **kwargs):
super(WeightedSequenceLayer, self).__init__(**kwargs)
self.weight_normalization = weight_normalization
self.supports_masking = supports_masking
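
Note that `weight_normalization` now defaults to `True`, so weighted sequence embeddings are normalized unless the caller passes `weight_norm=False` on the feature column. A sketch of wiring the layer by hand, mirroring the call pattern in `get_varlen_pooling_list` above; the shapes are illustrative and the pooled output is assumed to have shape `(batch, 1, embedding_size)` like other pooled sequence features.

```python
from tensorflow.python.keras.layers import Input
from deepctr.layers.sequence import WeightedSequenceLayer, SequencePoolingLayer

seq_embed = Input(shape=(5, 4))                # (batch, maxlen, embedding_size)
seq_length = Input(shape=(1,), dtype='int32')  # true sequence length per sample
seq_weight = Input(shape=(5, 1))               # one weight per sequence position

# weight_normalization defaults to True in 0.7.1; pass weight_normalization=False to keep raw weights
weighted = WeightedSequenceLayer()([seq_embed, seq_length, seq_weight])
pooled = SequencePoolingLayer('mean', supports_masking=False)([weighted, seq_length])
```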
2 changes: 1 addition & 1 deletion deepctr/utils.py
@@ -40,7 +40,7 @@ def check(version):
'\nDeepCTR version {0} detected. Your version is {1}.\nUse `pip install -U deepctr` to upgrade.Changelog: https://github.com/shenweichen/DeepCTR/releases/tag/v{0}'.format(
latest_version, version))
except Exception as e:
print(e)
print("Please check the latest version manually on https://pypi.org/project/deepctr/#history")
return

Thread(target=check, args=(version,)).start()
22 changes: 12 additions & 10 deletions docs/source/Examples.md
@@ -246,11 +246,11 @@ if __name__ == "__main__":

use_weighted_sequence = False
if use_weighted_sequence:
varlen_feature_columns = [VarLenSparseFeat('genres', maxlen= max_len,vocabulary_size=len(
key2index) + 1,embedding_dim=4, combiner='mean',weight_name='genres_weight')] # Notice : value 0 is for padding for sequence input feature
varlen_feature_columns = [VarLenSparseFeat(SparseFeat('genres',vocabulary_size=len(
key2index) + 1,embedding_dim=4), maxlen= max_len, combiner='mean',weight_name='genres_weight')] # Notice : value 0 is for padding for sequence input feature
else:
varlen_feature_columns = [VarLenSparseFeat('genres', maxlen=max_len,vocabulary_size= len(
key2index) + 1,embedding_dim=4, combiner='mean',weight_name=None)] # Notice : value 0 is for padding for sequence input feature
varlen_feature_columns = [VarLenSparseFeat(SparseFeat('genres',vocabulary_size= len(
key2index) + 1,embedding_dim=4), maxlen=max_len, combiner='mean',weight_name=None)] # Notice : value 0 is for padding for sequence input feature

linear_feature_columns = fixlen_feature_columns + varlen_feature_columns
dnn_feature_columns = fixlen_feature_columns + varlen_feature_columns
@@ -279,8 +279,8 @@ import numpy as np
import pandas as pd
from tensorflow.python.keras.preprocessing.sequence import pad_sequences

from deepctr.inputs import SparseFeat, VarLenSparseFeat, get_feature_names
from deepctr.models import DeepFM
from deepctr.inputs import SparseFeat, VarLenSparseFeat,get_feature_names

if __name__ == "__main__":
data = pd.read_csv("./movielens_sample.txt")
@@ -301,20 +301,22 @@ if __name__ == "__main__":

# 2.set hashing space for each sparse field and generate feature config for sequence feature

fixlen_feature_columns = [SparseFeat(feat, data[feat].nunique() * 5,embedding_dim=4, use_hash=True, dtype='string')
fixlen_feature_columns = [SparseFeat(feat, data[feat].nunique() * 5, embedding_dim=4, use_hash=True, dtype='string')
for feat in sparse_features]
varlen_feature_columns = [VarLenSparseFeat('genres', maxlen=max_len,vocabulary_size=100,embedding_dim=4,combiner= 'mean', use_hash=True,
dtype="string")] # Notice : value 0 is for padding for sequence input feature
varlen_feature_columns = [
VarLenSparseFeat(SparseFeat('genres', vocabulary_size=100, embedding_dim=4, use_hash=True, dtype="string"),
maxlen=max_len, combiner='mean',
)] # Notice : value 0 is for padding for sequence input feature
linear_feature_columns = fixlen_feature_columns + varlen_feature_columns
dnn_feature_columns = fixlen_feature_columns + varlen_feature_columns
feature_names = get_feature_names(linear_feature_columns + dnn_feature_columns)

# 3.generate input data for model
model_input = {name:data[name] for name in feature_names}
model_input = {name: data[name] for name in feature_names}
model_input['genres'] = genres_list

# 4.Define Model,compile and train
model = DeepFM(linear_feature_columns,dnn_feature_columns, task='regression')
model = DeepFM(linear_feature_columns, dnn_feature_columns, task='regression')

model.compile("adam", "mse", metrics=['mse'], )
history = model.fit(model_input, data[target].values,
11 changes: 3 additions & 8 deletions docs/source/Features.md
@@ -42,19 +42,14 @@ DNN based CTR prediction models usually have following 4 modules:

### VarLenSparseFeat

``VarLenSparseFeat`` is a namedtuple with signature ``VarLenSparseFeat(name, maxlen, vocabulary_size, embedding_dim, combiner,use_hash, dtype, length_name,weight_name, embedding_name, group_name)``
``VarLenSparseFeat`` is a namedtuple with signature ``VarLenSparseFeat(sparsefeat, maxlen, combiner, length_name, weight_name,weight_norm)``

- name : feature name
- sparsefeat : an instance of `SparseFeat`
- maxlen : maximum length of this feature for all samples
- vocabulary_size : number of unique feature values for sprase feature or hashing space when `use_hash=True`
- embedding_dim : embedding dimension
- combiner : pooling method,can be ``sum``,``mean`` or ``max``
- use_hash : defualt `False`.if `True` the input will be hashed to space of size `vocabulary_size`.
- dtype : default `float32`.dtype of input tensor.
- length_name : feature length name,if `None`, value 0 in feature is for padding.
- weight_name : default `None`. If not None, the sequence feature will be multiplied by the feature whose name is `weight_name`.
- embedding_name : default `None`. If None, the `embedding_name` will be same as `name`.
- group_name : feature group of this feature.
- weight_norm : default `True`. Whether to normalize the weight score or not.

## Models

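
A short usage sketch of the signature documented above; the feature name, vocabulary size, and maxlen are illustrative, and the weight/length inputs would be supplied as extra model inputs named `genres_weight` and `genres_length`.

```python
from deepctr.inputs import SparseFeat, VarLenSparseFeat

genres = VarLenSparseFeat(
    SparseFeat('genres', vocabulary_size=101, embedding_dim=4),
    maxlen=5,                     # pad/truncate every sample to 5 genres
    combiner='mean',              # 'sum', 'mean' or 'max'
    length_name='genres_length',  # feed true lengths instead of relying on 0-padding
    weight_name='genres_weight',  # optional per-genre weights
    weight_norm=True,             # normalize the weights before pooling
)
```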
2 changes: 1 addition & 1 deletion docs/source/History.md
@@ -1,6 +1,6 @@
# History
- 01/28/2020 : [v0.7.1](https://github.com/shenweichen/DeepCTR/releases/tag/v0.7.1) released.Simplify [VarLenSparseFeat](./Features.html#varlensparsefeat),support setting weight_normalization.Fix problem of embedding size of `SparseFeat` in `linear_feature_columns`.
- 11/24/2019 : [v0.7.0](https://github.com/shenweichen/DeepCTR/releases/tag/v0.7.0) released.Refactor [feature columns](./Features.html#feature-columns).Different features can use different `embedding_dim` and group-wise interaction is available by setting `group_name`.

- 11/06/2019 : [v0.6.3](https://github.com/shenweichen/DeepCTR/releases/tag/v0.6.3) released.Add `WeightedSequenceLayer` and support [weighted sequence feature input](./Examples.html#multi-value-input-movielens).
- 10/03/2019 : [v0.6.2](https://github.com/shenweichen/DeepCTR/releases/tag/v0.6.2) released.Simplify the input logic.
- 09/08/2019 : [v0.6.1](https://github.com/shenweichen/DeepCTR/releases/tag/v0.6.1) released.Fix bugs in `CCPM` and `DynamicGRU`.
2 changes: 1 addition & 1 deletion docs/source/conf.py
@@ -26,7 +26,7 @@
# The short X.Y version
version = ''
# The full version, including alpha/beta/rc tags
release = '0.7.0'
release = '0.7.1'


# -- General configuration ---------------------------------------------------
