v0.7.1
* Simplify `VarLenSparseFeat`, support setting weight_normalization.

* Fix the embedding size of `SparseFeat` in `linear_feature_columns`.
浅梦 committed Jan 28, 2020
1 parent db229dc commit dcf583f
Showing 22 changed files with 129 additions and 109 deletions.
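
For orientation before the per-file diffs: the user-facing change is that `VarLenSparseFeat` no longer repeats the vocabulary and embedding arguments but wraps a `SparseFeat`, and it gains a `weight_norm` flag. A rough before/after sketch based on the diffs below; the feature name and sizes are illustrative.

```python
from deepctr.inputs import SparseFeat, VarLenSparseFeat

# Up to 0.7.0, a sequence feature duplicated the sparse-feature arguments:
# VarLenSparseFeat('genres', maxlen=5, vocabulary_size=101, embedding_dim=4, combiner='mean')

# From 0.7.1, wrap a SparseFeat instead; weight_norm controls normalization of the
# optional per-position weights (see WeightedSequenceLayer below).
genres = VarLenSparseFeat(SparseFeat('genres', vocabulary_size=101, embedding_dim=4),
                          maxlen=5, combiner='mean', weight_norm=True)
```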
2 changes: 1 addition & 1 deletion .github/ISSUE_TEMPLATE/bug_report.md
@@ -20,7 +20,7 @@ Steps to reproduce the behavior:
**Operating environment(运行环境):**
- python version [e.g. 3.4, 3.6]
- tensorflow version [e.g. 1.4.0, 1.12.0]
- deepctr version [e.g. 0.5.2,]
- deepctr version [e.g. 0.7.1,]

**Additional context**
Add any other context about the problem here.
2 changes: 1 addition & 1 deletion .github/ISSUE_TEMPLATE/question.md
@@ -17,4 +17,4 @@ Add any other context about the problem here.
**Operating environment(运行环境):**
- python version [e.g. 3.6]
- tensorflow version [e.g. 1.4.0,]
- deepctr version [e.g. 0.5.2,]
- deepctr version [e.g. 0.7.1,]
2 changes: 1 addition & 1 deletion deepctr/__init__.py
@@ -1,4 +1,4 @@
from .utils import check_version

__version__ = '0.7.0'
__version__ = '0.7.1'
check_version(__version__)
109 changes: 62 additions & 47 deletions deepctr/inputs.py
@@ -14,67 +14,74 @@
from tensorflow.python.keras.regularizers import l2

from .layers.sequence import SequencePoolingLayer, WeightedSequenceLayer
from .layers.utils import Hash, concat_func, Linear,add_func
from .layers.utils import Hash, concat_func, Linear, add_func

DEFAULT_GROUP_NAME = "default_group"


class SparseFeat(namedtuple('SparseFeat',
['name', 'vocabulary_size', 'embedding_dim', 'use_hash', 'dtype', 'embedding_name', 'group_name'])):
['name', 'vocabulary_size', 'embedding_dim', 'use_hash', 'dtype', 'embedding_name',
'group_name'])):
__slots__ = ()

def __new__(cls, name, vocabulary_size, embedding_dim=4, use_hash=False, dtype="int32", embedding_name=None,
group_name=DEFAULT_GROUP_NAME):
if embedding_name is None:
embedding_name = name
if embedding_dim == "auto":
embedding_dim = 6 * int(pow(vocabulary_size, 0.25))
embedding_dim = 6 * int(pow(vocabulary_size, 0.25))
return super(SparseFeat, cls).__new__(cls, name, vocabulary_size, embedding_dim, use_hash, dtype,
embedding_name, group_name)

def __hash__(self):
return self.name.__hash__()

# def __eq__(self, other):
# if self.name == other.name and self.embedding_name == other.embedding_name:
# return True
# return False

# def __repr__(self):
# return 'SparseFeat:'+self.name
class VarLenSparseFeat(namedtuple('VarLenSparseFeat',
['sparsefeat', 'maxlen', 'combiner', 'length_name', 'weight_name', 'weight_norm'])):
__slots__ = ()

def __new__(cls, sparsefeat, maxlen, combiner="mean", length_name=None, weight_name=None, weight_norm=True):
return super(VarLenSparseFeat, cls).__new__(cls, sparsefeat, maxlen, combiner, length_name, weight_name,
weight_norm)

class DenseFeat(namedtuple('DenseFeat', ['name', 'dimension', 'dtype'])):
__slots__ = ()
@property
def name(self):
return self.sparsefeat.name

def __new__(cls, name, dimension=1, dtype="float32"):
return super(DenseFeat, cls).__new__(cls, name, dimension, dtype)
@property
def vocabulary_size(self):
return self.sparsefeat.vocabulary_size

def __hash__(self):
return self.name.__hash__()
@property
def embedding_dim(self):
return self.sparsefeat.embedding_dim

# def __eq__(self, other):
# if self.name == other.name:
# return True
# return False
@property
def use_hash(self):
return self.sparsefeat.use_hash

# def __repr__(self):
# return 'DenseFeat:'+self.name
@property
def dtype(self):
return self.sparsefeat.dtype

@property
def embedding_name(self):
return self.sparsefeat.embedding_name

class VarLenSparseFeat(namedtuple('VarLenFeat',
['name', 'maxlen', 'vocabulary_size', 'embedding_dim', 'combiner', 'use_hash',
'dtype','length_name' ,'weight_name', 'embedding_name', 'group_name'])):
@property
def group_name(self):
return self.sparsefeat.group_name

def __hash__(self):
return self.name.__hash__()


class DenseFeat(namedtuple('DenseFeat', ['name', 'dimension', 'dtype'])):
__slots__ = ()

def __new__(cls, name, maxlen, vocabulary_size, embedding_dim=4, combiner="mean", use_hash=False, dtype="float32",
length_name=None, weight_name=None, embedding_name=None, group_name=DEFAULT_GROUP_NAME):
if embedding_name is None:
embedding_name = name
if embedding_dim == "auto":
embedding_dim = 6 * int(pow(vocabulary_size, 0.25))
return super(VarLenSparseFeat, cls).__new__(cls, name, maxlen, vocabulary_size, embedding_dim, combiner,
use_hash, dtype, length_name,weight_name, embedding_name, group_name)
def __new__(cls, name, dimension=1, dtype="float32"):
return super(DenseFeat, cls).__new__(cls, name, dimension, dtype)

def __hash__(self):
return self.name.__hash__()
@@ -85,7 +92,7 @@ def __hash__(self):
# return False

# def __repr__(self):
# return 'VarLenSparseFeat:'+self.name
# return 'DenseFeat:'+self.name


def get_feature_names(feature_columns):
@@ -111,9 +118,9 @@ def build_input_features(feature_columns, prefix=''):
dtype=fc.dtype)
if fc.weight_name is not None:
input_features[fc.weight_name] = Input(shape=(fc.maxlen, 1), name=prefix + fc.weight_name,
dtype="float32")
dtype="float32")
if fc.length_name is not None:
input_features[fc.length_name] = Input((1,),name=prefix+fc.length_name,dtype='int32')
input_features[fc.length_name] = Input((1,), name=prefix + fc.length_name, dtype='int32')

else:
raise TypeError("Invalid feature column type,got", type(fc))
@@ -123,12 +130,12 @@ def build_input_features(feature_columns, prefix=''):

def create_embedding_dict(sparse_feature_columns, varlen_sparse_feature_columns, init_std, seed, l2_reg,
prefix='sparse_', seq_mask_zero=True):
sparse_embedding = {feat.embedding_name:Embedding(feat.vocabulary_size, feat.embedding_dim,
embeddings_initializer=RandomNormal(
mean=0.0, stddev=init_std, seed=seed),
embeddings_regularizer=l2(l2_reg),
name=prefix + '_emb_' + feat.embedding_name) for feat in sparse_feature_columns}

sparse_embedding = {feat.embedding_name: Embedding(feat.vocabulary_size, feat.embedding_dim,
embeddings_initializer=RandomNormal(
mean=0.0, stddev=init_std, seed=seed),
embeddings_regularizer=l2(l2_reg),
name=prefix + '_emb_' + feat.embedding_name) for feat in
sparse_feature_columns}

if varlen_sparse_feature_columns and len(varlen_sparse_feature_columns) > 0:
for feat in varlen_sparse_feature_columns:
@@ -160,7 +167,7 @@ def get_embedding_vec_list(embedding_dict, input_dict, sparse_feature_columns, r

def create_embedding_matrix(feature_columns, l2_reg, init_std, seed, prefix="", seq_mask_zero=True):
sparse_feature_columns = list(
filter(lambda x: isinstance(x, SparseFeat) , feature_columns)) if feature_columns else []
filter(lambda x: isinstance(x, SparseFeat), feature_columns)) if feature_columns else []
varlen_sparse_feature_columns = list(
filter(lambda x: isinstance(x, VarLenSparseFeat), feature_columns)) if feature_columns else []
sparse_emb_dict = create_embedding_dict(sparse_feature_columns, varlen_sparse_feature_columns, init_std, seed,
@@ -170,25 +177,32 @@ def create_embedding_matrix(feature_columns, l2_reg, init_std, seed, prefix="",

def get_linear_logit(features, feature_columns, units=1, use_bias=False, init_std=0.0001, seed=1024, prefix='linear',
l2_reg=0):
for i in range(len(feature_columns)):
if isinstance(feature_columns[i], SparseFeat):
feature_columns[i] = feature_columns[i]._replace(embedding_dim=1)
if isinstance(feature_columns[i], VarLenSparseFeat):
feature_columns[i] = feature_columns[i]._replace(
sparsefeat=feature_columns[i].sparsefeat._replace(embedding_dim=1))

linear_emb_list = [input_from_feature_columns(features, feature_columns, l2_reg, init_std, seed,
prefix=prefix + str(i))[0] for i in range(units)]
_, dense_input_list = input_from_feature_columns(features, feature_columns, l2_reg, init_std, seed, prefix=prefix)

linear_logit_list = []
for i in range(units):

if len(linear_emb_list[0]) > 0 and len(dense_input_list) > 0:
if len(linear_emb_list[i]) > 0 and len(dense_input_list) > 0:
sparse_input = concat_func(linear_emb_list[i])
dense_input = concat_func(dense_input_list)
linear_logit = Linear(l2_reg, mode=2, use_bias=use_bias)([sparse_input, dense_input])
elif len(linear_emb_list[0]) > 0:
elif len(linear_emb_list[i]) > 0:
sparse_input = concat_func(linear_emb_list[i])
linear_logit = Linear(l2_reg, mode=0, use_bias=use_bias)(sparse_input)
elif len(dense_input_list) > 0:
dense_input = concat_func(dense_input_list)
linear_logit = Linear(l2_reg, mode=1, use_bias=use_bias)(dense_input)
else:
#raise NotImplementedError
# raise NotImplementedError
return add_func([])
linear_logit_list.append(linear_logit)

@@ -235,15 +249,15 @@ def get_varlen_pooling_list(embedding_dict, features, varlen_sparse_feature_colu
feature_length_name = fc.length_name
if feature_length_name is not None:
if fc.weight_name is not None:
seq_input = WeightedSequenceLayer()(
seq_input = WeightedSequenceLayer(weight_normalization=fc.weight_norm)(
[embedding_dict[feature_name], features[feature_length_name], features[fc.weight_name]])
else:
seq_input = embedding_dict[feature_name]
vec = SequencePoolingLayer(combiner, supports_masking=False)(
[seq_input, features[feature_length_name]])
else:
if fc.weight_name is not None:
seq_input = WeightedSequenceLayer(supports_masking=True)(
seq_input = WeightedSequenceLayer(weight_normalization=fc.weight_norm, supports_masking=True)(
[embedding_dict[feature_name], features[fc.weight_name]])
else:
seq_input = embedding_dict[feature_name]
@@ -254,6 +268,7 @@ def get_varlen_pooling_list(embedding_dict, features, varlen_sparse_feature_colu
return chain.from_iterable(pooling_vec_list.values())
return pooling_vec_list


def get_dense_input(features, feature_columns):
dense_feature_columns = list(filter(lambda x: isinstance(x, DenseFeat), feature_columns)) if feature_columns else []
dense_input_list = []
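
The `get_linear_logit` change above is what fixes the embedding size of `SparseFeat` in `linear_feature_columns`: linear terms need 1-dimensional embeddings, and since feature columns are namedtuples, the code swaps the field with `_replace` (for `VarLenSparseFeat`, on the nested `SparseFeat`). A standalone sketch of that mechanism; the names and sizes are made up.

```python
from deepctr.inputs import SparseFeat, VarLenSparseFeat

fc = SparseFeat('user_id', vocabulary_size=1000, embedding_dim=8)
linear_fc = fc._replace(embedding_dim=1)        # namedtuple copy with a 1-d embedding
assert fc.embedding_dim == 8 and linear_fc.embedding_dim == 1

seq_fc = VarLenSparseFeat(SparseFeat('genres', vocabulary_size=100, embedding_dim=8), maxlen=5)
# embedding_dim now lives on the wrapped SparseFeat, so replace it there:
linear_seq_fc = seq_fc._replace(sparsefeat=seq_fc.sparsefeat._replace(embedding_dim=1))
assert linear_seq_fc.embedding_dim == 1         # read back through the delegating property
```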
2 changes: 1 addition & 1 deletion deepctr/layers/core.py
@@ -66,7 +66,7 @@ def build(self, input_shape):
if input_shape[0][-1] != input_shape[1][-1] or input_shape[0][1] != 1:
raise ValueError('A `LocalActivationUnit` layer requires '
'inputs of a two inputs with shape (None,1,embedding_size) and (None,T,embedding_size)'
'Got different shapes: %s,%s' % (input_shape))
'Got different shapes: %s,%s' % (input_shape[0],input_shape[1]))
size = 4 * \
int(input_shape[0][-1]
) if len(self.hidden_units) == 0 else self.hidden_units[-1]
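
For context on the `LocalActivationUnit` change: `%`-formatting only unpacks a tuple, so `'%s,%s' % (input_shape)` with `input_shape` being a list of two shapes raises `TypeError: not enough arguments for format string` instead of printing the shapes; passing the two elements explicitly restores the intended message. A minimal illustration with made-up shapes.

```python
shapes = [(None, 1, 8), (None, 4, 8)]  # stand-ins for the two Keras input shapes

# "%s,%s" % (shapes)   # would raise TypeError: not enough arguments for format string
print('Got different shapes: %s,%s' % (shapes[0], shapes[1]))
# -> Got different shapes: (None, 1, 8),(None, 4, 8)
```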
4 changes: 2 additions & 2 deletions deepctr/layers/sequence.py
@@ -121,12 +121,12 @@ class WeightedSequenceLayer(Layer):
- 3D tensor with shape: ``(batch_size, T, embedding_size)``.
Arguments
- **weight_normalization**: bool.Whether normalize the weight socre before applying to sequence.
- **weight_normalization**: bool.Whether normalize the weight score before applying to sequence.
- **supports_masking**:If True,the input need to support masking.
"""

def __init__(self,weight_normalization=False, supports_masking=False, **kwargs):
def __init__(self,weight_normalization=True, supports_masking=False, **kwargs):
super(WeightedSequenceLayer, self).__init__(**kwargs)
self.weight_normalization = weight_normalization
self.supports_masking = supports_masking
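
Note that `weight_normalization` now defaults to `True`, so weighted sequence embeddings are normalized unless the caller passes `weight_norm=False` on the feature column. A sketch of wiring the layer by hand, mirroring the call pattern in `get_varlen_pooling_list` above; the shapes are illustrative and the pooled output is assumed to have shape `(batch, 1, embedding_size)` like other pooled sequence features.

```python
from tensorflow.python.keras.layers import Input
from deepctr.layers.sequence import WeightedSequenceLayer, SequencePoolingLayer

seq_embed = Input(shape=(5, 4))                # (batch, maxlen, embedding_size)
seq_length = Input(shape=(1,), dtype='int32')  # true sequence length per sample
seq_weight = Input(shape=(5, 1))               # one weight per sequence position

# weight_normalization defaults to True in 0.7.1; pass weight_normalization=False to keep raw weights
weighted = WeightedSequenceLayer()([seq_embed, seq_length, seq_weight])
pooled = SequencePoolingLayer('mean', supports_masking=False)([weighted, seq_length])
```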
2 changes: 1 addition & 1 deletion deepctr/utils.py
@@ -40,7 +40,7 @@ def check(version):
'\nDeepCTR version {0} detected. Your version is {1}.\nUse `pip install -U deepctr` to upgrade.Changelog: https://github.com/shenweichen/DeepCTR/releases/tag/v{0}'.format(
latest_version, version))
except Exception as e:
print(e)
print("Please check the latest version manually on https://pypi.org/project/deepctr/#history")
return

Thread(target=check, args=(version,)).start()
22 changes: 12 additions & 10 deletions docs/source/Examples.md
@@ -246,11 +246,11 @@ if __name__ == "__main__":

use_weighted_sequence = False
if use_weighted_sequence:
varlen_feature_columns = [VarLenSparseFeat('genres', maxlen= max_len,vocabulary_size=len(
key2index) + 1,embedding_dim=4, combiner='mean',weight_name='genres_weight')] # Notice : value 0 is for padding for sequence input feature
varlen_feature_columns = [VarLenSparseFeat(SparseFeat('genres',vocabulary_size=len(
key2index) + 1,embedding_dim=4), maxlen= max_len, combiner='mean',weight_name='genres_weight')] # Notice : value 0 is for padding for sequence input feature
else:
varlen_feature_columns = [VarLenSparseFeat('genres', maxlen=max_len,vocabulary_size= len(
key2index) + 1,embedding_dim=4, combiner='mean',weight_name=None)] # Notice : value 0 is for padding for sequence input feature
varlen_feature_columns = [VarLenSparseFeat(SparseFeat('genres',vocabulary_size= len(
key2index) + 1,embedding_dim=4), maxlen=max_len, combiner='mean',weight_name=None)] # Notice : value 0 is for padding for sequence input feature

linear_feature_columns = fixlen_feature_columns + varlen_feature_columns
dnn_feature_columns = fixlen_feature_columns + varlen_feature_columns
@@ -279,8 +279,8 @@ import numpy as np
import pandas as pd
from tensorflow.python.keras.preprocessing.sequence import pad_sequences

from deepctr.inputs import SparseFeat, VarLenSparseFeat, get_feature_names
from deepctr.models import DeepFM
from deepctr.inputs import SparseFeat, VarLenSparseFeat,get_feature_names

if __name__ == "__main__":
data = pd.read_csv("./movielens_sample.txt")
@@ -301,20 +301,22 @@ if __name__ == "__main__":

# 2.set hashing space for each sparse field and generate feature config for sequence feature

fixlen_feature_columns = [SparseFeat(feat, data[feat].nunique() * 5,embedding_dim=4, use_hash=True, dtype='string')
fixlen_feature_columns = [SparseFeat(feat, data[feat].nunique() * 5, embedding_dim=4, use_hash=True, dtype='string')
for feat in sparse_features]
varlen_feature_columns = [VarLenSparseFeat('genres', maxlen=max_len,vocabulary_size=100,embedding_dim=4,combiner= 'mean', use_hash=True,
dtype="string")] # Notice : value 0 is for padding for sequence input feature
varlen_feature_columns = [
VarLenSparseFeat(SparseFeat('genres', vocabulary_size=100, embedding_dim=4, use_hash=True, dtype="string"),
maxlen=max_len, combiner='mean',
)] # Notice : value 0 is for padding for sequence input feature
linear_feature_columns = fixlen_feature_columns + varlen_feature_columns
dnn_feature_columns = fixlen_feature_columns + varlen_feature_columns
feature_names = get_feature_names(linear_feature_columns + dnn_feature_columns)

# 3.generate input data for model
model_input = {name:data[name] for name in feature_names}
model_input = {name: data[name] for name in feature_names}
model_input['genres'] = genres_list

# 4.Define Model,compile and train
model = DeepFM(linear_feature_columns,dnn_feature_columns, task='regression')
model = DeepFM(linear_feature_columns, dnn_feature_columns, task='regression')

model.compile("adam", "mse", metrics=['mse'], )
history = model.fit(model_input, data[target].values,
11 changes: 3 additions & 8 deletions docs/source/Features.md
@@ -42,19 +42,14 @@ DNN based CTR prediction models usually have following 4 modules:

### VarLenSparseFeat

``VarLenSparseFeat`` is a namedtuple with signature ``VarLenSparseFeat(name, maxlen, vocabulary_size, embedding_dim, combiner,use_hash, dtype, length_name,weight_name, embedding_name, group_name)``
``VarLenSparseFeat`` is a namedtuple with signature ``VarLenSparseFeat(sparsefeat, maxlen, combiner, length_name, weight_name,weight_norm)``

- name : feature name
- sparsefeat : an instance of `SparseFeat`
- maxlen : maximum length of this feature for all samples
- vocabulary_size : number of unique feature values for sprase feature or hashing space when `use_hash=True`
- embedding_dim : embedding dimension
- combiner : pooling method,can be ``sum``,``mean`` or ``max``
- use_hash : defualt `False`.if `True` the input will be hashed to space of size `vocabulary_size`.
- dtype : default `float32`.dtype of input tensor.
- length_name : feature length name,if `None`, value 0 in feature is for padding.
- weight_name : default `None`. If not None, the sequence feature will be multiplied by the feature whose name is `weight_name`.
- embedding_name : default `None`. If None, the `embedding_name` will be same as `name`.
- group_name : feature group of this feature.
- weight_norm : default `True`. Whether to normalize the weight score or not.

## Models

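
A short usage sketch of the signature documented above; the feature name, vocabulary size, and maxlen are illustrative, and the weight/length inputs would be supplied as extra model inputs named `genres_weight` and `genres_length`.

```python
from deepctr.inputs import SparseFeat, VarLenSparseFeat

genres = VarLenSparseFeat(
    SparseFeat('genres', vocabulary_size=101, embedding_dim=4),
    maxlen=5,                     # pad/truncate every sample to 5 genres
    combiner='mean',              # 'sum', 'mean' or 'max'
    length_name='genres_length',  # feed true lengths instead of relying on 0-padding
    weight_name='genres_weight',  # optional per-genre weights
    weight_norm=True,             # normalize the weights before pooling
)
```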
2 changes: 1 addition & 1 deletion docs/source/History.md
@@ -1,6 +1,6 @@
# History
- 01/28/2020 : [v0.7.1](https://github.com/shenweichen/DeepCTR/releases/tag/v0.7.1) released.Simplify [VarLenSparseFeat](./Features.html#varlensparsefeat),support setting weight_normalization.Fix problem of embedding size of `SparseFeat` in `linear_feature_columns`.
- 11/24/2019 : [v0.7.0](https://github.com/shenweichen/DeepCTR/releases/tag/v0.7.0) released.Refactor [feature columns](./Features.html#feature-columns).Different features can use different `embedding_dim` and group-wise interaction is available by setting `group_name`.

- 11/06/2019 : [v0.6.3](https://github.com/shenweichen/DeepCTR/releases/tag/v0.6.3) released.Add `WeightedSequenceLayer` and support [weighted sequence feature input](./Examples.html#multi-value-input-movielens).
- 10/03/2019 : [v0.6.2](https://github.com/shenweichen/DeepCTR/releases/tag/v0.6.2) released.Simplify the input logic.
- 09/08/2019 : [v0.6.1](https://github.com/shenweichen/DeepCTR/releases/tag/v0.6.1) released.Fix bugs in `CCPM` and `DynamicGRU`.
2 changes: 1 addition & 1 deletion docs/source/conf.py
@@ -26,7 +26,7 @@
# The short X.Y version
version = ''
# The full version, including alpha/beta/rc tags
release = '0.7.0'
release = '0.7.1'


# -- General configuration ---------------------------------------------------
