<a href="https://colab.research.google.com/github/rurusasu/RecommendSystem/blob/main/IntTowerRecommendModel.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# 自作データセットで TowTowerRecommendModel を試験する
参考:
* [1つの Two-Tower モデルで4種類の推薦を実現する](https://qiita.com/rayuron/items/cc04468de7af3d82bac2)
* [Two-Tower Recommendationについて](https://qiita.com/akira_saigo/items/fb07c970febc872d22e3)

データセット: MovieLens-100k

In [2]:
# Googleドライブのマウント
from google.colab import drive
drive.mount("/content/drive")

Mounted at /content/drive


In [3]:
base_dir = "/content/drive/MyDrive/ColabNotebooks"

In [4]:
!pip install  --upgrade -q tensorflow_recommenders tensorflow-datasets apache-beam

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m96.2/96.2 kB[0m [31m696.9 kB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.1/5.1 MB[0m [31m9.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m14.5/14.5 MB[0m [31m11.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m89.7/89.7 kB[0m [31m7.8 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m142.7/142.7 kB[0m [31m4.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m152.0/152.0 kB[0m [31m11.5 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.1/3.1 MB[0m [31m22.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━

In [5]:
import re
import os
import tempfile
from typing import List, Dict, Union, Tuple

%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import tensorflow as tf
import tensorflow_datasets as tfds
import tensorflow_recommenders as tfrs
import matplotlib.pyplot as plt
from IPython.display import clear_output

plt.style.use("seaborn-whitegrid")

  plt.style.use("seaborn-whitegrid")


# データセットの読み出しと前処理

In [6]:
base_dir = "/content/drive/MyDrive/Google AI Studio"

In [7]:
# エクセルファイルからデータを読み込む
data = pd.read_excel(f"{base_dir}/sample_merged_full_10k.xlsx")

data.head(2)

Unnamed: 0,user_id,target_id,rating,user_name_target,nickname_target,gender_target,location_target,age_range_target,height_range_target,body_type_target,...,body_type_user,personality_user,appearance_user,job_user,blood_type_user,car_user,interests_user,salary_user,plan_user,account_creation_timestamp_user
0,1,8627,0,原田遥,アオイ,女性,埼玉県伊奈町,45-49,150-154,普通,...,スリム,元気,セクシー系,会社員,O型,有り,技術・プログラミング,8160000,option2,2024-01-14 00:11:34
1,1,18213,0,井上萌,ユイ,女性,福島県玉川村,30-34,150-154,スリム,...,スリム,元気,セクシー系,会社員,O型,有り,技術・プログラミング,8160000,option2,2024-01-14 00:11:34


In [8]:
from tensorflow.keras.utils import to_categorical

# ラベルに不正な値が含まれている可能性があるので、削除
cleaning_data = data[data['rating'].isin([0, 1, 2])]
cleaning_data = cleaning_data.dropna()

# 不要なカラムの削除
cleaning_data = cleaning_data.drop(columns=['user_id', 'target_id', 'user_name_target', 'nickname_target',
                     'plan_target', 'account_creation_timestamp_target',
                     'user_name_user', 'nickname_user', 'plan_user',
                     'account_creation_timestamp_user'])

# 特徴量とラベルの分離
features = cleaning_data.drop(columns=['rating'])
labels = cleaning_data['rating']
# ラベルをOne-Hotエンコーディング
labels = to_categorical(labels, num_classes=3)

# データの確認
features.head(2)

Unnamed: 0,gender_target,location_target,age_range_target,height_range_target,body_type_target,personality_target,appearance_target,job_target,blood_type_target,car_target,...,age_range_user,height_range_user,body_type_user,personality_user,appearance_user,job_user,blood_type_user,car_user,interests_user,salary_user
0,女性,埼玉県伊奈町,45-49,150-154,普通,面白い,-,会社員,A型,無し,...,22-25,175-179,スリム,元気,セクシー系,会社員,O型,有り,技術・プログラミング,8160000
1,女性,福島県玉川村,30-34,150-154,スリム,明るい,セクシー系,会社員,A型,無し,...,22-25,175-179,スリム,元気,セクシー系,会社員,O型,有り,技術・プログラミング,8160000


In [9]:
print("features nan\n" + str(features.isna().sum()))
print(features.info())

features nan
gender_target          0
location_target        0
age_range_target       0
height_range_target    0
body_type_target       0
personality_target     0
appearance_target      0
job_target             0
blood_type_target      0
car_target             0
interests_target       0
salary_target          0
gender_user            0
location_user          0
age_range_user         0
height_range_user      0
body_type_user         0
personality_user       0
appearance_user        0
job_user               0
blood_type_user        0
car_user               0
interests_user         0
salary_user            0
dtype: int64
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 108562 entries, 0 to 108561
Data columns (total 24 columns):
 #   Column               Non-Null Count   Dtype 
---  ------               --------------   ----- 
 0   gender_target        108562 non-null  object
 1   location_target      108562 non-null  object
 2   age_range_target     108562 non-null  object
 3   height_r

# モデル定義

## 付属クラス

In [10]:
# カテゴリカルな特徴量を表現するクラス
class SparseFeat:
    def __init__(self, name: str, vocabulary_size: int, embedding_dim: int):
        self.name = name
        self.vocabulary_size = vocabulary_size
        self.embedding_dim = embedding_dim

# 数値特徴量を表現するクラス
class DenseFeat:
    def __init__(self, name: str):
        self.name = name

# 可変長のカテゴリカル特徴量を表現するクラス
class VarLenSparseFeat:
    def __init__(self, name: str, vocabulary_size: int, embedding_dim: int):
        self.name = name
        self.vocabulary_size = vocabulary_size
        self.embedding_dim = embedding_dim

## 付属関数

In [11]:
# Embedding辞書の作成
def create_embedding_dict(feature_columns: List[Union[SparseFeat, DenseFeat, VarLenSparseFeat]]) -> Dict[str, tf.keras.layers.Embedding]:
    embedding_dict = {}
    for feat in feature_columns:
        if isinstance(feat, SparseFeat) or isinstance(feat, VarLenSparseFeat):
            embedding_dict[feat.name] = tf.keras.layers.Embedding(feat.vocabulary_size, feat.embedding_dim)
    return embedding_dict

# 入力データから_userで終わる特徴のみを抽出
def filter_user_inputs(inputs: Dict[str, tf.Tensor]) -> Dict[str, tf.Tensor]:
    #for k, v in inputs.items():
    #    print(f"Key: {k}, Value shape: {v.shape}, Value dtype: {v.dtype}")
    return {k: v for k, v in inputs.items() if k.endswith('_user')}

# スパース特徴量のエンベディングの作成
def create_sparse_embeddings(user_inputs: Dict[str, tf.Tensor], feature_columns: List[Union[SparseFeat, DenseFeat, VarLenSparseFeat]], embedding_dict: Dict[str, tf.keras.layers.Embedding]) -> Tuple[List[tf.Tensor], List[tf.Tensor]]:
    #print("User Inputs:", user_inputs.keys())  # デバッグ用にuser_inputsのキーを表示
    #print("Embedding Dict:", embedding_dict.keys())  # デバッグ用にembedding_dictのキーを表示
    sparse_feature_columns = list(filter(lambda x: isinstance(x, SparseFeat), feature_columns))
    dense_feature_columns = list(filter(lambda x: isinstance(x, DenseFeat), feature_columns))
    varlen_sparse_feature_columns = list(filter(lambda x: isinstance(x, VarLenSparseFeat), feature_columns))

    sparse_embedding_list = [embedding_dict[feat.name](user_inputs[feat.name]) for feat in sparse_feature_columns if feat.name in user_inputs]
    varlen_sparse_embedding_list = [create_varlen_sparse_embedding(embedding_dict, user_inputs, feat) for feat in varlen_sparse_feature_columns if feat.name in user_inputs]
    dense_value_list = [user_inputs[feat.name] for feat in dense_feature_columns if feat.name in user_inputs]

    return sparse_embedding_list + varlen_sparse_embedding_list, dense_value_list

# VarLenSparseFeat のエンベディングを平均化して形状を変換
def create_varlen_sparse_embedding(embedding_dict: Dict[str, tf.keras.layers.Embedding], user_inputs: Dict[str, tf.Tensor], feature: VarLenSparseFeat) -> tf.Tensor:
    # 各インタレストのエンベディングを取得
    embedding = embedding_dict[feature.name](user_inputs[feature.name])
    #print(f"Original shape for {feature.name}: {user_inputs[feature.name].shape}")  # デバッグ用出力

    # エンベディングを平均化
    embedding_mean = tf.reduce_mean(embedding, axis=1)

    return embedding_mean

## 前処理クラス

In [12]:
# 前処理クラスの定義
class Preprocessor:
    def __init__(self, features: pd.DataFrame, embedding_dims: dict):
        self.embedding_dims = embedding_dims
        self.vocabularies = self.generate_vocabularies(features)
        self.lookup_layers = self.create_lookup_layers()

    def generate_vocabularies(self, features: pd.DataFrame) -> dict:
        vocabularies = {}
        for feature_name in self.embedding_dims:
            if features[feature_name].dtype == 'object' and not feature_name.startswith('interests_'):
                vocab = sorted(features[feature_name].dropna().unique())
                vocabularies[feature_name] = vocab
            elif feature_name.startswith('interests_'):
                all_interests = []
                for interests_str in features[feature_name].dropna():
                    all_interests.extend(re.split(r',\s*', interests_str))
                vocabularies[feature_name] = sorted(set(all_interests))
        return vocabularies

    def create_lookup_layers(self) -> dict:
        lookup_layers = {}
        for feature_name, vocab in self.vocabularies.items():
            if not feature_name.startswith('interests_'):
                lookup_layers[feature_name] = tf.keras.layers.StringLookup(vocabulary=vocab, mask_token=None, num_oov_indices=1, output_mode='int')
            else:
                lookup_layers[feature_name] = tf.keras.layers.StringLookup(vocabulary=vocab, mask_token=None, output_mode='multi_hot')
        return lookup_layers

    def process(self, data: pd.DataFrame) -> dict:
        processed_data = {}
        # NaNを含むレコードを削除
        data = data.dropna(subset=self.lookup_layers.keys())

        for feature_name, lookup_layer in self.lookup_layers.items():
            if feature_name.startswith('interests_'):
                # interests_userの分割
                interests_list = data[feature_name].apply(lambda x: re.split(r',\s*', x)).tolist()
                max_len = max(map(len, interests_list))
                interests_padded = [interests + [''] * (max_len - len(interests)) for interests in interests_list]
                indices = lookup_layer(interests_padded)
                processed_data[feature_name] = indices
            else:
                indices = lookup_layer(data[feature_name].astype(str).values)
                indices = tf.where(indices == 0, tf.zeros_like(indices), indices - 1)  # インデックスの範囲を修正
                processed_data[feature_name] = indices
                max_index = len(self.vocabularies[feature_name])
                if tf.reduce_any(processed_data[feature_name] >= max_index):
                    raise ValueError(f"Feature '{feature_name}' contains out-of-vocabulary index.")

        # salary_userおよびsalary_targetの特別な前処理
        for feature_name in ['salary_user', 'salary_target']:
            if feature_name in data.columns:
                processed_data[feature_name] = tf.convert_to_tensor(data[feature_name].values, dtype=tf.float32)

        return processed_data

    def create_feature_columns(self) -> List[Union[SparseFeat, DenseFeat, VarLenSparseFeat]]:
        feature_columns = []
        for feature_name, vocab in self.vocabularies.items():
            if feature_name.startswith('interests_'):
                feature_columns.append(VarLenSparseFeat(feature_name, vocabulary_size=len(vocab), embedding_dim=self.embedding_dims[feature_name]))
            else:
                feature_columns.append(SparseFeat(feature_name, vocabulary_size=len(vocab), embedding_dim=self.embedding_dims[feature_name]))
        for feature_name in self.embedding_dims:
            if feature_name not in self.vocabularies:
                feature_columns.append(DenseFeat(feature_name))
        return feature_columns

# 例として、embedding_dimsを定義
embedding_dims = {
    'gender_user': 2,
    'gender_target': 2,
    'location_user': 10,
    'location_target': 10,
    'age_range_user': 5,
    'age_range_target': 5,
    'height_range_user': 5,
    'height_range_target': 5,
    'body_type_user': 8,
    'body_type_target': 8,
    'personality_user': 10,
    'personality_target': 10,
    'appearance_user': 8,
    'appearance_target': 8,
    'job_user': 15,
    'job_target': 15,
    'blood_type_user': 4,
    'blood_type_target': 4,
    'car_user': 2,
    'car_target': 2,
    'interests_user': 20,
    'interests_target': 20,
    'salary_user': 10,
    'salary_target': 10
}

# データ読み込み例
# df_sample = pd.read_excel('/path/to/sample_merged_full_10k.xlsx')

# Preprocessorインスタンスの作成
preprocessor = Preprocessor(features, embedding_dims)

# vocabulariesの確認
vocabularies = preprocessor.vocabularies
print(vocabularies)

# データの前処理
processed_data = preprocessor.process(features)

# feature_columnsの生成
feature_columns = preprocessor.create_feature_columns()
print(feature_columns)


{'gender_user': ['女性', '男性'], 'gender_target': ['女性', '男性'], 'location_user': ['三重県いなべ市', '三重県亀山市', '三重県伊勢市', '三重県伊賀市', '三重県南伊勢町', '三重県名張市', '三重県四日市市', '三重県多気町', '三重県大台町', '三重県大紀町', '三重県尾鷲市', '三重県川越町', '三重県度会町', '三重県御浜町', '三重県志摩市', '三重県明和町', '三重県朝日町', '三重県木曽岬町', '三重県東員町', '三重県松阪市', '三重県桑名市', '三重県津市', '三重県熊野市', '三重県玉城町', '三重県紀北町', '三重県紀宝町', '三重県菰野町', '三重県鈴鹿市', '三重県鳥羽市', '京都府与謝野町', '京都府久御山町', '京都府亀岡市', '京都府井手町', '京都府京丹後市', '京都府京丹波町', '京都府京田辺市', '京都府京都市', '京都府伊根町', '京都府八幡市', '京都府南丹市', '京都府南山城村', '京都府向日市', '京都府和束町', '京都府城陽市', '京都府大山崎町', '京都府宇治市', '京都府宇治田原町', '京都府宮津市', '京都府木津川市', '京都府福知山市', '京都府笠置町', '京都府精華町', '京都府綾部市', '京都府舞鶴市', '京都府長岡京市', '佐賀県みやき町', '佐賀県上峰町', '佐賀県伊万里市', '佐賀県佐賀市', '佐賀県吉野ヶ里町', '佐賀県唐津市', '佐賀県基山町', '佐賀県多久市', '佐賀県大町町', '佐賀県太良町', '佐賀県嬉野市', '佐賀県小城市', '佐賀県有田町', '佐賀県武雄市', '佐賀県江北町', '佐賀県玄海町', '佐賀県白石町', '佐賀県神埼市', '佐賀県鳥栖市', '佐賀県鹿島市', '兵庫県たつの市', '兵庫県三木市', '兵庫県三田市', '兵庫県上郡町', '兵庫県丹波市', '兵庫県丹波篠山市', '兵庫県伊丹市', '兵庫県佐用町', '兵庫県加古川市', '兵庫県加東市', '兵庫県加西市', '兵庫県南あわじ市', '兵庫県多可町', '兵庫県太子町', '兵庫県姫路

## レイヤー

### LightSE

In [13]:
# LightSEクラスの定義
class LightSE(tf.keras.Model):
    def __init__(self, field_size, embedding_size=32):
        super(LightSE, self).__init__()
        self.field_size = field_size
        self.embedding_size = embedding_size
        self.excitation = tf.keras.Sequential([
            tf.keras.layers.Dense(self.field_size, use_bias=False)
        ])
        self.softmax = tf.keras.layers.Softmax(axis=1)

    def call(self, inputs):
        if len(inputs.shape) != 3:
            raise ValueError(
                f"Unexpected inputs dimensions {len(inputs.shape)}, expect to be 3 dimensions")

        Z = tf.reduce_mean(inputs, axis=-1)
        A = self.excitation(Z)
        A = self.softmax(A)
        out = inputs * tf.expand_dims(A, axis=2)

        return inputs + out



# 例として、embedding_dimsを定義
embedding_dims = {
    'gender_user': 2,
    'gender_target': 2,
    'location_user': 10,
    'location_target': 10,
    'age_range_user': 5,
    'age_range_target': 5,
    'height_range_user': 5,
    'height_range_target': 5,
    'body_type_user': 8,
    'body_type_target': 8,
    'personality_user': 10,
    'personality_target': 10,
    'appearance_user': 8,
    'appearance_target': 8,
    'job_user': 15,
    'job_target': 15,
    'blood_type_user': 4,
    'blood_type_target': 4,
    'car_user': 2,
    'car_target': 2,
    'interests_user': 20,
    'interests_target': 20,
    'salary_user': 10,
    'salary_target': 10
}

# データ読み込み例
# df_sample = pd.read_excel('/path/to/sample_merged_full_10k.xlsx')

# Preprocessorインスタンスの作成
preprocessor = Preprocessor(features, embedding_dims)

# vocabulariesの確認
vocabularies = preprocessor.vocabularies
#print(vocabularies)

# feature_columnsの生成
feature_columns = preprocessor.create_feature_columns()
#print(feature_columns)

# Embedding辞書の作成
embedding_dict = create_embedding_dict(feature_columns)

# データの前処理
processed_data = preprocessor.process(features)

model = LightSE(field_size=12)

try:
    # user_inputsのフィルタリング
    user_inputs = filter_user_inputs(processed_data)

    # スパース特徴量のエンベディングの作成
    sparse_embeddings, dense_values = create_sparse_embeddings(user_inputs, feature_columns, embedding_dict)

    # スパースエンベディングと密な特徴量を連結
    user_dnn_input = tf.concat(sparse_embeddings, axis=-1)

    # データをモデルに通す
    predictions = model(tf.expand_dims(user_dnn_input, axis=1))
    print(predictions)
except KeyError as e:
    print(f"Key error: {e}")
except RuntimeError as e:
    print(f"Runtime error: {e}")

tf.Tensor(
[[[-0.0502103   0.04805514  0.01645458 ... -0.03073121  0.04528201
   -0.0090122 ]
  [-0.05019602  0.04804148  0.0164499  ... -0.03072247  0.04526913
   -0.00900964]
  [-0.05020457  0.04804965  0.0164527  ... -0.0307277   0.04527684
   -0.00901118]
  ...
  [-0.05019338  0.04803895  0.01644904 ... -0.03072086  0.04526675
   -0.00900917]
  [-0.05021456  0.04805922  0.01645598 ... -0.03073382  0.04528585
   -0.00901297]
  [-0.05021363  0.04805833  0.01645567 ... -0.03073325  0.04528501
   -0.0090128 ]]

 [[-0.0502103   0.04805514  0.01645458 ... -0.03073121  0.045282
   -0.00901221]
  [-0.05019602  0.04804148  0.0164499  ... -0.03072247  0.04526913
   -0.00900964]
  [-0.05020457  0.04804965  0.0164527  ... -0.0307277   0.04527684
   -0.00901118]
  ...
  [-0.05019338  0.04803895  0.01644904 ... -0.03072086  0.04526675
   -0.00900917]
  [-0.05021456  0.04805922  0.01645598 ... -0.03073382  0.04528585
   -0.00901297]
  [-0.05021363  0.04805833  0.01645567 ... -0.03073325  0.045285

### DNN

In [32]:
class DNN(tf.keras.Model):
    def __init__(self, layer_sizes: List[int], activation='relu', use_bn: bool = False):
        super(DNN, self).__init__()
        self.use_bn = use_bn
        self.dense_layers = tf.keras.Sequential()

        for layer_size in layer_sizes:
            self.dense_layers.add(tf.keras.layers.Dense(layer_size))
            if self.use_bn:
                self.dense_layers.add(tf.keras.layers.BatchNormalization())
            self.dense_layers.add(tf.keras.layers.Activation(activation))

        self.output_layer = tf.keras.layers.Dense(1)

    def call(self, inputs):
        x = self.dense_layers(inputs)
        return self.output_layer(x)

# Example usage
model = DNN([64, 128, 256], activation='relu', use_bn=True)

## UserModel

In [39]:
# UserModelの定義
class UserModel(tf.keras.Model):
    def __init__(self, layer_sizes, field_size: int, embedding_size: int, embedding_dict: dict, feature_columns, activation: str='relu', use_bn: bool = False):
        super().__init__()
        self.User_SE = LightSE(field_size=field_size, embedding_size=embedding_size)
        self.feature_columns = feature_columns
        self.embedding_dict = embedding_dict
        self.dnn = DNN(layer_sizes=layer_sizes, activation=activation, use_bn=use_bn)

    def call(self, inputs: dict) -> tf.Tensor:
        # スパース特徴量のエンベディングの作成
        sparse_embeddings, dense_values = create_sparse_embeddings(user_inputs, self.feature_columns, self.embedding_dict)

        # スパースエンベディングと密な特徴量を連結
        user_dnn_input = tf.concat(sparse_embeddings, axis=-1)

        # データをモデルに通す
        dnn_input = self.User_SE(tf.expand_dims(user_dnn_input, axis=1))

        # DNN に通す
        dnn_output = self.dnn(dnn_input)

        return dnn_output

# 例として、embedding_dimsを定義
embedding_dims = {
    'gender_user': 2,
    'gender_target': 2,
    'location_user': 10,
    'location_target': 10,
    'age_range_user': 5,
    'age_range_target': 5,
    'height_range_user': 5,
    'height_range_target': 5,
    'body_type_user': 8,
    'body_type_target': 8,
    'personality_user': 10,
    'personality_target': 10,
    'appearance_user': 8,
    'appearance_target': 8,
    'job_user': 15,
    'job_target': 15,
    'blood_type_user': 4,
    'blood_type_target': 4,
    'car_user': 2,
    'car_target': 2,
    'interests_user': 20,
    'interests_target': 20,
    'salary_user': 10,
    'salary_target': 10
}
layer_sizes = [32, 32, 32]

# データ読み込み例
# df_sample = pd.read_excel('/path/to/sample_merged_full_10k.xlsx')

# Preprocessorインスタンスの作成
preprocessor = Preprocessor(features, embedding_dims)

# vocabulariesの確認
vocabularies = preprocessor.vocabularies
#print(vocabularies)

# feature_columnsの生成
feature_columns = preprocessor.create_feature_columns()
#print(feature_columns)

# Embedding辞書の作成
embedding_dict = create_embedding_dict(feature_columns)

# データの前処理
processed_data = preprocessor.process(features)

model = UserModel(layer_sizes, field_size=12, embedding_size=32, embedding_dict=embedding_dict, feature_columns=feature_columns, use_bn=True)

try:
    # user_inputsのフィルタリング
    user_inputs = filter_user_inputs(processed_data)

    # スパース特徴量のエンベディングの作成
    #sparse_embeddings, dense_values = create_sparse_embeddings(user_inputs, feature_columns, embedding_dict)
    # スパースエンベディングと密な特徴量を連結
    #user_dnn_input = tf.concat(sparse_embeddings, axis=-1)

    # LightSEモデルのインスタンス作成
    #print(f"user_dnn_input shape: {user_dnn_input.shape}")
    #model = LightSE(field_size=user_dnn_input.shape[1])

    # データをモデルに通す
    predictions = model(user_inputs)
    print(predictions)
except KeyError as e:
    print(f"Key error: {e}")
except RuntimeError as e:
    print(f"Runtime error: {e}")

tf.Tensor(
[[[0.00588983]
  [0.00588954]
  [0.00588945]
  ...
  [0.00588934]
  [0.00588943]
  [0.00588966]]

 [[0.00588983]
  [0.00588954]
  [0.00588945]
  ...
  [0.00588934]
  [0.00588943]
  [0.00588966]]

 [[0.00588983]
  [0.00588954]
  [0.00588945]
  ...
  [0.00588934]
  [0.00588943]
  [0.00588966]]

 ...

 [[0.00200697]
  [0.0020069 ]
  [0.00200687]
  ...
  [0.00200683]
  [0.00200686]
  [0.00200693]]

 [[0.00200698]
  [0.0020069 ]
  [0.00200687]
  ...
  [0.00200684]
  [0.00200686]
  [0.00200693]]

 [[0.00200697]
  [0.00200689]
  [0.00200687]
  ...
  [0.00200684]
  [0.00200686]
  [0.00200693]]], shape=(108562, 12, 1), dtype=float32)


## ItemModel

In [40]:
# ItemModelの定義
class ItemModel(tf.keras.Model):
    def __init__(self, layer_sizes, field_size: int, embedding_size: int, embedding_dict: dict, feature_columns, activation: str='relu', use_bn: bool = False):
        super().__init__()
        self.Item_SE = LightSE(field_size=field_size, embedding_size=embedding_size)
        self.embedding_dict = embedding_dict
        self.feature_columns = feature_columns
        self.dnn = DNN(layer_sizes=layer_sizes, activation=activation, use_bn=use_bn)

    def call(self, inputs: dict) -> tf.Tensor:
        # スパース特徴量のエンベディングの作成
        sparse_embeddings, dense_values = create_sparse_embeddings(inputs, self.feature_columns, self.embedding_dict)
        # スパースエンベディングと密な特徴量を連結
        item_dnn_input = tf.concat(sparse_embeddings, axis=-1)
        # データをモデルに通す
        dnn_input = self.Item_SE(tf.expand_dims(item_dnn_input, axis=1))

        # DNN に通す
        dnn_output = self.dnn(dnn_input)

        return dnn_output


layer_sizes = [32, 32, 32]

# データ読み込み例
# df_sample = pd.read_excel('/path/to/sample_merged_full_10k.xlsx')

# Preprocessorインスタンスの作成
preprocessor = Preprocessor(features, embedding_dims)

# vocabulariesの確認
vocabularies = preprocessor.vocabularies
#print(vocabularies)

# feature_columnsの生成
feature_columns = preprocessor.create_feature_columns()
#print(feature_columns)

# Embedding辞書の作成
embedding_dict = create_embedding_dict(feature_columns)

# データの前処理
processed_data = preprocessor.process(features)

model = ItemModel(layer_sizes, field_size=12, embedding_size=32, embedding_dict=embedding_dict, feature_columns=feature_columns, use_bn=True)

try:
    # user_inputsのフィルタリング
    user_inputs = filter_user_inputs(processed_data)

    # スパース特徴量のエンベディングの作成
    #sparse_embeddings, dense_values = create_sparse_embeddings(user_inputs, feature_columns, embedding_dict)
    # スパースエンベディングと密な特徴量を連結
    #user_dnn_input = tf.concat(sparse_embeddings, axis=-1)

    # LightSEモデルのインスタンス作成
    #print(f"user_dnn_input shape: {user_dnn_input.shape}")
    #model = LightSE(field_size=user_dnn_input.shape[1])

    # データをモデルに通す
    predictions = model(user_inputs)
    print(predictions)
except KeyError as e:
    print(f"Key error: {e}")
except RuntimeError as e:
    print(f"Runtime error: {e}")

tf.Tensor(
[[[-0.03202065]
  [-0.03201955]
  [-0.03202061]
  ...
  [-0.03202048]
  [-0.03201958]
  [-0.0320199 ]]

 [[-0.03202065]
  [-0.03201955]
  [-0.03202061]
  ...
  [-0.03202048]
  [-0.03201957]
  [-0.0320199 ]]

 [[-0.03202065]
  [-0.03201955]
  [-0.03202061]
  ...
  [-0.03202048]
  [-0.03201958]
  [-0.0320199 ]]

 ...

 [[-0.03404881]
  [-0.03404821]
  [-0.03404878]
  ...
  [-0.03404871]
  [-0.03404822]
  [-0.03404839]]

 [[-0.03404881]
  [-0.03404821]
  [-0.03404878]
  ...
  [-0.03404872]
  [-0.03404822]
  [-0.03404839]]

 [[-0.03404881]
  [-0.03404821]
  [-0.03404878]
  ...
  [-0.03404872]
  [-0.03404822]
  [-0.03404839]]], shape=(108562, 12, 1), dtype=float32)


## TowTowerModel

In [41]:
import tensorflow as tf

class TwoTowerModel(tf.keras.Model):
    def __init__(self, layer_sizes, field_size: int, embedding_size: int, embedding_dict, feature_columns, activation: str='relu', use_bn: bool = False):
        super().__init__()
        self.user_model = UserModel(layer_sizes, field_size, embedding_size, embedding_dict, feature_columns=feature_columns, activation=activation, use_bn=use_bn)
        self.item_model = ItemModel(layer_sizes, field_size, embedding_size, embedding_dict, feature_columns=feature_columns, activation=activation, use_bn=use_bn)
        self.logit_scale = tf.Variable(1.0, trainable=True)

    def call(self, inputs):
        user_inputs = {key: value for key, value in inputs.items() if key.endswith('_user')}
        target_inputs = {key: value for key, value in inputs.items() if key.endswith('_target')}

        user_embeddings = self.user_model(user_inputs)
        target_embeddings = self.item_model(target_inputs)

        # デバッグ出力
        tf.print("ユーザー埋め込みの形状:", tf.shape(user_embeddings))
        tf.print("ターゲット埋め込みの形状:", tf.shape(target_embeddings))

        # バッチサイズが同じであることを確認する
        user_batch_size = user_embeddings.shape[0]
        target_batch_size = target_embeddings.shape[0]

        if user_batch_size != target_batch_size:
            raise ValueError(f"ユーザーとターゲットの埋め込みのバッチサイズは同じでなければなりませんが、{user_batch_size} と {target_batch_size} が与えられました。")

        logit_scale_exp = tf.exp(self.logit_scale)
        logits = tf.matmul(user_embeddings, target_embeddings, transpose_b=True) * logit_scale_exp
        return logits

# 訓練

In [18]:
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.losses import CategoricalCrossentropy

## データセット準備

In [19]:
# Preprocessorインスタンスの作成
preprocessor = Preprocessor(features, embedding_dims)

# vocabulariesの確認
vocabularies = preprocessor.vocabularies
#print(vocabularies)

# feature_columnsの生成
feature_columns = preprocessor.create_feature_columns()
#print(feature_columns)

# Embedding辞書の作成
embedding_dict = create_embedding_dict(feature_columns)

# データの前処理
processed_data = preprocessor.process(features)

dataset = tf.data.Dataset.from_tensor_slices((dict(processed_data), labels))

# データセットを訓練データと検証データに分割
dataset_size = len(data)
train_size = int(0.8 * dataset_size)
val_size = int(0.2 * dataset_size)

train_dataset = dataset.take(train_size)
val_dataset = dataset.skip(train_size).take(val_size)

# モデルの訓練に適したようにデータセットをバッチ化
train_dataset = train_dataset.batch(128)
val_dataset = val_dataset.batch(128)

## モデル準備

In [42]:
# Two Towerモデルの定義
layer_sizes = [128, 64, 32]
# クラスの数を指定（例: 3クラス）
#num_classes = 3

field_size = 12
embedding_size = 32
use_bn = True

two_tower_model = TwoTowerModel(
    layer_sizes=layer_sizes,
    field_size=field_size,
    embedding_size=embedding_size,
    embedding_dict=embedding_dict,
    feature_columns=feature_columns,
    use_bn=use_bn
    )

In [21]:
# 損失関数とオプティマイザー
def contrastive_loss(logits):
    batch_size = tf.shape(logits)[0]
    labels = tf.range(batch_size)
    return tf.reduce_mean(tf.nn.sparse_softmax_cross_entropy_with_logits(labels=labels, logits=logits))

optimizer = Adam(learning_rate=0.001)

# メトリック
train_loss_metric = tf.keras.metrics.Mean(name='train_loss')
train_accuracy_metric = tf.keras.metrics.CategoricalAccuracy(name='train_accuracy')
val_loss_metric = tf.keras.metrics.Mean(name='val_loss')
val_accuracy_metric = tf.keras.metrics.CategoricalAccuracy(name='val_accuracy')

In [22]:
# 訓練ステップ
@tf.function
def train_step(inputs, labels):
    with tf.GradientTape() as tape:
        logits = two_tower_model(inputs)
        loss = contrastive_loss(logits)
    gradients = tape.gradient(loss, two_tower_model.trainable_variables)
    optimizer.apply_gradients(zip(gradients, two_tower_model.trainable_variables))

    train_loss_metric(loss)
    train_accuracy_metric(labels, logits)

# 評価ステップ
@tf.function
def val_step(inputs, labels):
    logits = two_tower_model(inputs)
    loss = contrastive_loss(logits)

    val_loss_metric(loss)
    val_accuracy_metric(labels, logits)

In [None]:
epochs = 300
train_losses = []
train_accuracies = []
val_losses = []
val_accuracies = []

#print(train_dataset)
#print(val_dataset)

for epoch in range(epochs):
    print(f"Epoch {epoch + 1}/{epochs}")

    # 訓練データセットの反復
    for batch, labels_batch in train_dataset:
        train_step(batch, labels_batch)

    # 訓練の損失と精度を記録
    train_losses.append(train_loss_metric.result().numpy())
    train_accuracies.append(train_accuracy_metric.result().numpy())

    # 検証データセットの評価
    for val_batch, val_labels_batch in val_dataset:
        val_step(val_batch, val_labels_batch)

    # 検証の損失と精度を記録
    val_losses.append(val_loss_metric.result().numpy())
    val_accuracies.append(val_accuracy_metric.result().numpy())

    if epoch >= 20 and (epoch + 1) % 10 == 0:  # 最初の20エポックはスキップし、10エポックごとにグラフを更新
        # プロット
        clear_output(wait=True)  # 既存のグラフをクリア
        plt.figure(figsize=(12, 4))

        plt.subplot(1, 2, 1)
        plt.plot(train_losses, label='Training Loss')
        plt.plot(val_losses, label='Validation Loss')
        plt.legend(loc='upper right')
        plt.title('Training and Validation Loss')

        plt.subplot(1, 2, 2)
        plt.plot(train_accuracies, label='Training Accuracy')
        plt.plot(val_accuracies, label='Validation Accuracy')
        plt.legend(loc='lower right')
        plt.title('Training and Validation Accuracy')

        plt.show()

    # メトリックのリセット
    train_loss_metric.reset_states()
    train_accuracy_metric.reset_states()
    val_loss_metric.reset_states()
    val_accuracy_metric.reset_states()

# モデルの保存 (必要に応じて)
# two_tower_model.save('two_tower_model')

Epoch 1/300
Epoch 2/300
Epoch 3/300
Epoch 4/300
Epoch 5/300
Epoch 6/300
