diff --git a/.github/ISSUE_TEMPLATE/bug_report.md b/.github/ISSUE_TEMPLATE/bug_report.md
index a21b2abc..a99cfe41 100644
--- a/.github/ISSUE_TEMPLATE/bug_report.md
+++ b/.github/ISSUE_TEMPLATE/bug_report.md
@@ -19,8 +19,8 @@ Steps to reproduce the behavior:
 
 **Operating environment(运行环境):**
  - python version [e.g. 3.6, 3.7]
- - tensorflow version [e.g. 1.4.0, 1.15.0, 2.5.0]
- - deepctr version [e.g. 0.9.0,]
+ - tensorflow version [e.g. 1.4.0, 1.15.0, 2.10.0]
+ - deepctr version [e.g. 0.9.2,]
 
 **Additional context**
 Add any other context about the problem here.
diff --git a/.github/ISSUE_TEMPLATE/question.md b/.github/ISSUE_TEMPLATE/question.md
index 8aaf7ee6..8b7f819d 100644
--- a/.github/ISSUE_TEMPLATE/question.md
+++ b/.github/ISSUE_TEMPLATE/question.md
@@ -16,5 +16,5 @@ Add any other context about the problem here.
 
 **Operating environment(运行环境):**
  - python version [e.g. 3.6]
- - tensorflow version [e.g. 1.4.0, 1.15.0, 2.5.0]
- - deepctr version [e.g. 0.9.0,]
+ - tensorflow version [e.g. 1.4.0, 1.15.0, 2.10.0]
+ - deepctr version [e.g. 0.9.2,]
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index 44bcc9a5..7ed5bd15 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -17,8 +17,8 @@ jobs:
     timeout-minutes: 180
     strategy:
       matrix:
-        python-version: [3.6,3.7,3.8]
-        tf-version: [1.4.0,1.15.0,2.5.0,2.6.0,2.7.0,2.8.0,2.9.0]
+        python-version: [3.6,3.7,3.8,3.9,3.10.7]
+        tf-version: [1.4.0,1.15.0,2.6.0,2.7.0,2.8.0,2.9.0,2.10.0]
         
         exclude:
           - python-version: 3.7
@@ -37,12 +37,32 @@ jobs:
             tf-version: 2.8.0
           - python-version: 3.6
             tf-version: 2.9.0
+          - python-version: 3.6
+            tf-version: 2.10.0
           - python-version: 3.9
             tf-version: 1.4.0
           - python-version: 3.9
             tf-version: 1.15.0
           - python-version: 3.9
             tf-version: 2.2.0
+          - python-version: 3.9
+            tf-version: 2.5.0
+          - python-version: 3.9
+            tf-version: 2.6.0
+          - python-version: 3.9
+            tf-version: 2.7.0
+          - python-version: 3.10.7
+            tf-version: 1.4.0
+          - python-version: 3.10.7
+            tf-version: 1.15.0
+          - python-version: 3.10.7
+            tf-version: 2.2.0
+          - python-version: 3.10.7
+            tf-version: 2.5.0
+          - python-version: 3.10.7
+            tf-version: 2.6.0
+          - python-version: 3.10.7
+            tf-version: 2.7.0
     steps:
     
     - uses: actions/checkout@v3
diff --git a/README.md b/README.md
index 70ec4e28..f0d90c13 100644
--- a/README.md
+++ b/README.md
@@ -18,14 +18,12 @@
 <!-- [![Gitter](https://badges.gitter.im/DeepCTR/community.svg)](https://gitter.im/DeepCTR/community?utm_source=badge&utm_medium=badge&utm_campaign=pr-badge) -->
 
 
-DeepCTR is a **Easy-to-use**,**Modular** and **Extendible** package of deep-learning based CTR models along with lots of
+DeepCTR is a **Easy-to-use**, **Modular** and **Extendible** package of deep-learning based CTR models along with lots of
 core components layers which can be used to easily build custom models.You can use any complex model with `model.fit()`
 ，and `model.predict()` .
 
-- Provide `tf.keras.Model` like interface for **quick experiment**
-  . [example](https://deepctr-doc.readthedocs.io/en/latest/Quick-Start.html#getting-started-4-steps-to-deepctr)
-- Provide  `tensorflow estimator` interface for **large scale data** and **distributed training**
-  . [example](https://deepctr-doc.readthedocs.io/en/latest/Quick-Start.html#getting-started-4-steps-to-deepctr-estimator-with-tfrecord)
+- Provide `tf.keras.Model` like interfaces for **quick experiment**. [example](https://deepctr-doc.readthedocs.io/en/latest/Quick-Start.html#getting-started-4-steps-to-deepctr)
+- Provide  `tensorflow estimator` interface for **large scale data** and **distributed training**. [example](https://deepctr-doc.readthedocs.io/en/latest/Quick-Start.html#getting-started-4-steps-to-deepctr-estimator-with-tfrecord)
 - It is compatible with both `tf 1.x`  and `tf 2.x`.
 
 Some related projects:
diff --git a/deepctr/__init__.py b/deepctr/__init__.py
index d42e620d..3c6d40b5 100644
--- a/deepctr/__init__.py
+++ b/deepctr/__init__.py
@@ -1,4 +1,4 @@
 from .utils import check_version
 
-__version__ = '0.9.1'
+__version__ = '0.9.2'
 check_version(__version__)
diff --git a/deepctr/feature_column.py b/deepctr/feature_column.py
index 6f277ba1..5cc1930e 100644
--- a/deepctr/feature_column.py
+++ b/deepctr/feature_column.py
@@ -95,7 +95,7 @@ def __hash__(self):
 class DenseFeat(namedtuple('DenseFeat', ['name', 'dimension', 'dtype', 'transform_fn'])):
     """ Dense feature
     Args:
-        name: feature name,
+        name: feature name.
         dimension: dimension of the feature, default = 1.
         dtype: dtype of the feature, default="float32".
         transform_fn: If not `None` , a function that can be used to transform
diff --git a/deepctr/layers/sequence.py b/deepctr/layers/sequence.py
index 45a65915..93866640 100644
--- a/deepctr/layers/sequence.py
+++ b/deepctr/layers/sequence.py
@@ -442,7 +442,7 @@ class Transformer(Layer):
             - **blinding**: bool. Whether or not use blinding.
             - **seed**: A Python integer to use as random seed.
             - **supports_masking**:bool. Whether or not support masking.
-            - **attention_type**: str, Type of attention, the value must be one of { ``'scaled_dot_product'`` , ``'additive'`` }.
+            - **attention_type**: str, Type of attention, the value must be one of { ``'scaled_dot_product'`` , ``'cos'`` , ``'ln'`` , ``'additive'`` }.
             - **output_type**: ``'mean'`` , ``'sum'`` or `None`. Whether or not use average/sum pooling for output.
 
       References
@@ -490,6 +490,9 @@ def build(self, input_shape):
                                      initializer=glorot_uniform(seed=self.seed))
             self.v = self.add_weight('v', shape=[self.att_embedding_size], dtype=tf.float32,
                                      initializer=glorot_uniform(seed=self.seed))
+        elif self.attention_type == "ln":
+            self.att_ln_q = LayerNormalization()
+            self.att_ln_k = LayerNormalization()
         # if self.use_res:
         #     self.W_Res = self.add_weight(name='res', shape=[embedding_size, self.att_embedding_size * self.head_num], dtype=tf.float32,
         #                                  initializer=TruncatedNormal(seed=self.seed))
@@ -529,28 +532,42 @@ def call(self, inputs, mask=None, training=None, **kwargs):
             queries = self.query_pe(queries)
             keys = self.key_pe(queries)
 
-        querys = tf.tensordot(queries, self.W_Query,
-                              axes=(-1, 0))  # None T_q D*head_num
-        keys = tf.tensordot(keys, self.W_key, axes=(-1, 0))
-        values = tf.tensordot(keys, self.W_Value, axes=(-1, 0))
+        Q = tf.tensordot(queries, self.W_Query,
+                         axes=(-1, 0))  # N T_q D*h
+        K = tf.tensordot(keys, self.W_key, axes=(-1, 0))
+        V = tf.tensordot(keys, self.W_Value, axes=(-1, 0))
 
-        # head_num*None T_q D
-        querys = tf.concat(tf.split(querys, self.head_num, axis=2), axis=0)
-        keys = tf.concat(tf.split(keys, self.head_num, axis=2), axis=0)
-        values = tf.concat(tf.split(values, self.head_num, axis=2), axis=0)
+        # h*N T_q D
+        Q_ = tf.concat(tf.split(Q, self.head_num, axis=2), axis=0)
+        K_ = tf.concat(tf.split(K, self.head_num, axis=2), axis=0)
+        V_ = tf.concat(tf.split(V, self.head_num, axis=2), axis=0)
 
         if self.attention_type == "scaled_dot_product":
-            # head_num*None T_q T_k
-            outputs = tf.matmul(querys, keys, transpose_b=True)
+            # h*N T_q T_k
+            outputs = tf.matmul(Q_, K_, transpose_b=True)
 
-            outputs = outputs / (keys.get_shape().as_list()[-1] ** 0.5)
+            outputs = outputs / (K_.get_shape().as_list()[-1] ** 0.5)
+        elif self.attention_type == "cos":
+            Q_cos = tf.nn.l2_normalize(Q_, dim=-1)
+            K_cos = tf.nn.l2_normalize(K_, dim=-1)
+
+            outputs = tf.matmul(Q_cos, K_cos, transpose_b=True)  # h*N T_q T_k
+
+            outputs = outputs * 20  # Scale
+        elif self.attention_type == 'ln':
+            Q_ = self.att_ln_q(Q_)
+            K_ = self.att_ln_k(K_)
+
+            outputs = tf.matmul(Q_, K_, transpose_b=True)  # h*N T_q T_k
+            # Scale
+            outputs = outputs / (K_.get_shape().as_list()[-1] ** 0.5)
         elif self.attention_type == "additive":
-            querys_reshaped = tf.expand_dims(querys, axis=-2)
-            keys_reshaped = tf.expand_dims(keys, axis=-3)
-            outputs = tf.tanh(tf.nn.bias_add(querys_reshaped + keys_reshaped, self.b))
+            Q_reshaped = tf.expand_dims(Q_, axis=-2)
+            K_reshaped = tf.expand_dims(K_, axis=-3)
+            outputs = tf.tanh(tf.nn.bias_add(Q_reshaped + K_reshaped, self.b))
             outputs = tf.squeeze(tf.tensordot(outputs, tf.expand_dims(self.v, axis=-1), axes=[-1, 0]), axis=-1)
         else:
-            raise ValueError("attention_type must be scaled_dot_product or additive")
+            raise ValueError("attention_type must be [scaled_dot_product,cos,ln,additive]")
 
         key_masks = tf.tile(key_masks, [self.head_num, 1])
 
@@ -583,7 +600,7 @@ def call(self, inputs, mask=None, training=None, **kwargs):
         outputs = self.dropout(outputs, training=training)
         # Weighted sum
         # ( h*N, T_q, C/h)
-        result = tf.matmul(outputs, values)
+        result = tf.matmul(outputs, V_)
         result = tf.concat(tf.split(result, self.head_num, axis=0), axis=2)
 
         if self.use_res:
diff --git a/deepctr/models/deepfm.py b/deepctr/models/deepfm.py
index 49456f4f..f156e5fb 100644
--- a/deepctr/models/deepfm.py
+++ b/deepctr/models/deepfm.py
@@ -24,8 +24,8 @@ def DeepFM(linear_feature_columns, dnn_feature_columns, fm_group=(DEFAULT_GROUP_
            dnn_activation='relu', dnn_use_bn=False, task='binary'):
     """Instantiates the DeepFM Network architecture.
 
-    :param linear_feature_columns: An iterable containing all the features used by linear part of the model.
-    :param dnn_feature_columns: An iterable containing all the features used by deep part of the model.
+    :param linear_feature_columns: An iterable containing all the features used by the linear part of the model.
+    :param dnn_feature_columns: An iterable containing all the features used by the deep part of the model.
     :param fm_group: list, group_name of features that will be used to do feature interactions.
     :param dnn_hidden_units: list,list of positive integer or empty list, the layer number and units in each layer of DNN
     :param l2_reg_linear: float. L2 regularizer strength applied to linear part
diff --git a/docs/source/History.md b/docs/source/History.md
index e74dba8d..2e19942a 100644
--- a/docs/source/History.md
+++ b/docs/source/History.md
@@ -1,4 +1,5 @@
 # History
+- 10/15/2022 : [v0.9.2](https://github.com/shenweichen/DeepCTR/releases/tag/v0.9.2) released.Support python `3.9`,`3.10`.
 - 06/11/2022 : [v0.9.1](https://github.com/shenweichen/DeepCTR/releases/tag/v0.9.1) released.Improve compatibility with tensorflow `2.x`.
 - 09/03/2021 : [v0.9.0](https://github.com/shenweichen/DeepCTR/releases/tag/v0.9.0) released.Add multitask learning models:[SharedBottom](./Features.html#sharedbottom),[ESMM](./Features.html#esmm-entire-space-multi-task-model),[MMOE](./Features.html#mmoe-multi-gate-mixture-of-experts) and [PLE](./Features.html#ple-progressive-layered-extraction). [running example](./Examples.html#multitask-learning-mmoe)
 - 07/18/2021 : [v0.8.7](https://github.com/shenweichen/DeepCTR/releases/tag/v0.8.7) released.Support pre-defined key-value vocabulary in `Hash` Layer. [example](./Examples.html#hash-layer-with-pre-defined-key-value-vocabulary)
diff --git a/docs/source/conf.py b/docs/source/conf.py
index 50b0f80e..d0f0df24 100644
--- a/docs/source/conf.py
+++ b/docs/source/conf.py
@@ -26,7 +26,7 @@
 # The short X.Y version
 version = ''
 # The full version, including alpha/beta/rc tags
-release = '0.9.1'
+release = '0.9.2'
 
 
 # -- General configuration ---------------------------------------------------
diff --git a/docs/source/index.rst b/docs/source/index.rst
index c64d26e5..0330a10d 100644
--- a/docs/source/index.rst
+++ b/docs/source/index.rst
@@ -42,12 +42,12 @@ You can read the latest code and related projects
 
 News
 -----
+10/15/2022 : Support python `3.9`,`3.10`. `Changelog <https://github.com/shenweichen/DeepCTR/releases/tag/v0.9.2>`_
+
 06/11/2022 : Improve compatibility with tensorflow `2.x`. `Changelog <https://github.com/shenweichen/DeepCTR/releases/tag/v0.9.1>`_
 
 09/03/2021 : Add multitask learning models: `SharedBottom <./Features.html#sharedbottom>`_ , `ESMM <./Features.html#esmm-entire-space-multi-task-model>`_ , `MMOE <./Features.html#mmoe-multi-gate-mixture-of-experts>`_ , `PLE <./Features.html#ple-progressive-layered-extraction>`_ .  `running example <./Examples.html#multitask-learning-mmoe>`_ `Changelog <https://github.com/shenweichen/DeepCTR/releases/tag/v0.9.0>`_
 
-07/18/2021 : Support pre-defined key-value vocabulary in `Hash` Layer. `example <./Examples.html#hash-layer-with-pre-defined-key-value-vocabulary>`_ `Changelog <https://github.com/shenweichen/DeepCTR/releases/tag/v0.8.7>`_
-
 DisscussionGroup
 -----------------------
 
diff --git a/setup.py b/setup.py
index 62316c0c..43eee556 100644
--- a/setup.py
+++ b/setup.py
@@ -1,15 +1,21 @@
 import setuptools
 
-with open("README.md", "r") as fh:
+with open("README.md", "r",encoding='utf-8') as fh:
     long_description = fh.read()
 
-REQUIRED_PACKAGES = [
+import sys
+if sys.version_info < (3, 9):
+    REQUIRED_PACKAGES = [
     'h5py==2.10.0', 'requests'
-]
+    ]
+else:
+    REQUIRED_PACKAGES = [
+    'h5py==3.7.0', 'requests'
+    ]
 
 setuptools.setup(
     name="deepctr",
-    version="0.9.1",
+    version="0.9.2",
     author="Weichen Shen",
     author_email="weichenswc@163.com",
     description="Easy-to-use,Modular and Extendible package of deep learning based CTR(Click Through Rate) prediction models with tensorflow 1.x and 2.x .",
@@ -35,10 +41,11 @@
         'Intended Audience :: Science/Research',
         'Programming Language :: Python :: 3',
         'Programming Language :: Python :: 2.7',
-        'Programming Language :: Python :: 3.5',
         'Programming Language :: Python :: 3.6',
         'Programming Language :: Python :: 3.7',
         'Programming Language :: Python :: 3.8',
+        'Programming Language :: Python :: 3.9',
+        'Programming Language :: Python :: 3.10',
         'Topic :: Scientific/Engineering',
         'Topic :: Scientific/Engineering :: Artificial Intelligence',
         'Topic :: Software Development',
diff --git a/tests/layers/sequence_test.py b/tests/layers/sequence_test.py
index 1639baca..1e2a89a6 100644
--- a/tests/layers/sequence_test.py
+++ b/tests/layers/sequence_test.py
@@ -81,11 +81,15 @@ def test_BiLSTM(merge_mode):
                    input_shape=(BATCH_SIZE, SEQ_LENGTH, EMBEDDING_SIZE))
 
 
-def test_Transformer():
+@pytest.mark.parametrize(
+    'attention_type',
+    ['scaled_dot_product', 'cos', 'ln', 'additive']
+)
+def test_Transformer(attention_type):
     with CustomObjectScope({'Transformer': sequence.Transformer}):
         layer_test(sequence.Transformer,
                    kwargs={'att_embedding_size': 1, 'head_num': 8, 'use_layer_norm': True, 'supports_masking': False,
-                           'attention_type': 'additive', 'dropout_rate': 0.5, 'output_type': 'sum'},
+                           'attention_type': attention_type, 'dropout_rate': 0.5, 'output_type': 'sum'},
                    input_shape=[(BATCH_SIZE, SEQ_LENGTH, EMBEDDING_SIZE), (BATCH_SIZE, SEQ_LENGTH, EMBEDDING_SIZE),
                                 (BATCH_SIZE, 1), (BATCH_SIZE, 1)])