In [1]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Embedding, Input, Dense, Flatten, Concatenate

# Fixed seed so the simulated data and the initial weights are the same on
# every "Restart & Run All". (Some GPU kernels may still be non-deterministic.)
SEED = 42
np.random.seed(SEED)
tf.random.set_seed(SEED)

# Assumed vocabulary size of each categorical feature, and the embedding width
num_users = 1000
num_items = 1000
num_age_groups = 5
num_categories = 10
embedding_size = 64

# Input layers: one value per example for each feature
user_input = Input(shape=(1,), name='user_input')
item_input = Input(shape=(1,), name='item_input')
user_age_input = Input(shape=(1,), name='user_age_input')
item_category_input = Input(shape=(1,), name='item_category_input')

# Embedding layers: a separate lookup table per feature, all of the same width
user_embedding = Embedding(num_users, embedding_size, name='user_embedding')(user_input)
item_embedding = Embedding(num_items, embedding_size, name='item_embedding')(item_input)
user_age_embedding = Embedding(num_age_groups, embedding_size, name='user_age_embedding')(user_age_input)
item_category_embedding = Embedding(num_categories, embedding_size, name='item_category_embedding')(item_category_input)

# Flatten each embedding output into a plain vector per example
user_vec = Flatten()(user_embedding)
item_vec = Flatten()(item_embedding)
user_age_vec = Flatten()(user_age_embedding)
item_category_vec = Flatten()(item_category_embedding)

# Concatenate the four feature vectors into one
concat = Concatenate()([user_vec, item_vec, user_age_vec, item_category_vec])

# Multi-layer perceptron on top of the concatenated embeddings
mlp = Dense(128, activation='relu')(concat)
mlp = Dense(64, activation='relu')(mlp)
mlp = Dense(32, activation='relu')(mlp)

# Output layer: a single sigmoid unit
output = Dense(1, activation='sigmoid')(mlp)

# Build and compile the model
model = Model(inputs=[user_input, item_input, user_age_input, item_category_input], outputs=output)
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
model.summary()

# Simulated dataset: every feature code and every label is drawn uniformly at
# random, so there is no real signal for the model to learn here.
num_samples = 10000
user_data = np.random.randint(0, num_users, num_samples)
item_data = np.random.randint(0, num_items, num_samples)
user_age_data = np.random.randint(0, num_age_groups, num_samples)
item_category_data = np.random.randint(0, num_categories, num_samples)
labels = np.random.randint(0, 2, num_samples)

# Train the model; the inputs are passed in the same order as the Model inputs
model.fit([user_data, item_data, user_age_data, item_category_data], labels, epochs=10, batch_size=32)


Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 user_input (InputLayer)        [(None, 1)]          0           []                               
                                                                                                  
 item_input (InputLayer)        [(None, 1)]          0           []                               
                                                                                                  
 user_age_input (InputLayer)    [(None, 1)]          0           []                               
                                                                                                  
 item_category_input (InputLaye  [(None, 1)]         0           []                               
 r)                                                                                           

<keras.callbacks.History at 0x21654c4b2d0>

In [4]:
import numpy as np
import pandas as pd

# Dataset parameters
SEED = 42  # fixed seed so the simulated dataset is identical on every run
num_samples = 100000  # number of simulated rows used for the training demo
num_users = 1000
num_items = 1000
num_age_groups = 5
num_categories = 10
num_genders = 2  # assume two genders
num_days_of_week = 7  # 7 days in a week
num_hours_of_day = 24  # 24 hours in a day

# A dedicated generator keeps this cell reproducible no matter what other
# cells have already drawn from NumPy's global random state.
rng = np.random.default_rng(SEED)

# Simulated data: each feature is an integer code drawn uniformly from
# [0, cardinality); the label is a uniform random 0/1.
user_data = rng.integers(0, num_users, size=num_samples)
item_data = rng.integers(0, num_items, size=num_samples)
user_age_data = rng.integers(0, num_age_groups, size=num_samples)
item_category_data = rng.integers(0, num_categories, size=num_samples)
user_gender_data = rng.integers(0, num_genders, size=num_samples)
purchase_day_of_week_data = rng.integers(0, num_days_of_week, size=num_samples)
purchase_hour_of_day_data = rng.integers(0, num_hours_of_day, size=num_samples)
labels = rng.integers(0, 2, size=num_samples)

# Assemble the DataFrame (one column per feature plus the label)
df = pd.DataFrame({
    'User_ID': user_data,
    'Item_ID': item_data,
    'User_Age_Group': user_age_data,
    'Item_Category': item_category_data,
    'User_Gender': user_gender_data,
    'Purchase_Day_of_Week': purchase_day_of_week_data,
    'Purchase_Hour_of_Day': purchase_hour_of_day_data,
    'Label': labels
})

df



Unnamed: 0,User_ID,Item_ID,User_Age_Group,Item_Category,User_Gender,Purchase_Day_of_Week,Purchase_Hour_of_Day,Label
0,174,360,0,2,0,6,17,1
1,989,396,2,7,1,2,5,1
2,245,489,3,2,0,6,5,1
3,478,700,2,8,1,3,6,1
4,825,78,1,5,1,1,12,0
...,...,...,...,...,...,...,...,...
99995,587,772,4,5,1,5,8,1
99996,910,585,2,9,0,0,4,1
99997,615,549,4,2,1,4,9,0
99998,164,841,1,6,1,4,0,0


In [5]:
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Embedding, Input, Dense, Flatten, Concatenate

def build_ncf_model(df, embedding_size=64):
    """Build an uncompiled Keras model over seven categorical features.

    Every feature gets its own single-value input and its own embedding
    table. The flattened embeddings are concatenated and fed through a
    128 -> 64 -> 32 ReLU MLP that ends in one sigmoid unit.

    Args:
        df: DataFrame with the columns ``User_ID``, ``Item_ID``,
            ``User_Age_Group``, ``Item_Category``, ``User_Gender``,
            ``Purchase_Day_of_Week`` and ``Purchase_Hour_of_Day``. It is only
            used to size the embedding tables. Values are assumed to be
            non-negative integer codes.
        embedding_size: Width of every embedding vector.

    Returns:
        A ``Model`` whose inputs are ordered user, item, age group, item
        category, gender, day of week, hour of day.

    Raises:
        ValueError: If ``df`` has no rows, so no table size can be derived.
    """
    if len(df) == 0:
        raise ValueError("df must contain at least one row to size the embedding tables")

    # (DataFrame column, layer-name prefix), in the order of the model inputs.
    feature_specs = [
        ('User_ID', 'user'),
        ('Item_ID', 'item'),
        ('User_Age_Group', 'user_age'),
        ('Item_Category', 'item_category'),
        ('User_Gender', 'user_gender'),
        ('Purchase_Day_of_Week', 'day_of_week'),
        ('Purchase_Hour_of_Day', 'hour_of_day'),
    ]

    # An Embedding with input_dim=N only accepts indices 0..N-1. Sizing by
    # nunique() is too small whenever the observed codes have a gap
    # (IDs {0, 5} -> nunique 2, yet index 5 is looked up), so size each table
    # by the largest code present instead.
    vocab_sizes = [int(df[column].max()) + 1 for column, _ in feature_specs]

    # Input layers
    inputs = [Input(shape=(1,), name=f'{prefix}_input') for _, prefix in feature_specs]

    # Embedding layers, one per input
    embeddings = [
        Embedding(vocab_size, embedding_size, name=f'{prefix}_embedding')(feature_input)
        for (_, prefix), vocab_size, feature_input in zip(feature_specs, vocab_sizes, inputs)
    ]

    # Flatten each embedding output into a vector
    vectors = [Flatten()(embedding) for embedding in embeddings]

    # Concatenate all feature vectors
    concat = Concatenate()(vectors)

    # Multi-layer perceptron
    mlp = concat
    for units in (128, 64, 32):
        mlp = Dense(units, activation='relu')(mlp)

    # Output layer
    output = Dense(1, activation='sigmoid')(mlp)

    return Model(inputs=inputs, outputs=output)

# Example usage: build the network from the simulated DataFrame, compile it
# with the Adam optimizer and binary cross-entropy loss, then print the layer table.
ncf_model = build_ncf_model(df, 64)
ncf_model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)
ncf_model.summary()


Model: "model_1"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 user_input (InputLayer)        [(None, 1)]          0           []                               
                                                                                                  
 item_input (InputLayer)        [(None, 1)]          0           []                               
                                                                                                  
 user_age_input (InputLayer)    [(None, 1)]          0           []                               
                                                                                                  
 item_category_input (InputLaye  [(None, 1)]         0           []                               
 r)                                                                                         

In [6]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the fixed random_state keeps the split repeatable.
train_df, test_df = train_test_split(
    df,
    random_state=42,
    test_size=0.2,
)


In [7]:
def extract_features_and_labels(df):
    """Split a DataFrame into model inputs and the target.

    Returns a ``(features, labels)`` pair. ``features`` is a list holding one
    array per feature column, ordered User_ID, Item_ID, User_Age_Group,
    Item_Category, User_Gender, Purchase_Day_of_Week, Purchase_Hour_of_Day.
    ``labels`` is the ``Label`` column as an array.
    """
    feature_columns = (
        'User_ID',
        'Item_ID',
        'User_Age_Group',
        'Item_Category',
        'User_Gender',
        'Purchase_Day_of_Week',
        'Purchase_Hour_of_Day',
    )
    features = [df[column].values for column in feature_columns]
    return features, df['Label'].values

# Turn each split into the (feature arrays, labels) pair used for fitting and evaluation.
(train_features, train_labels), (test_features, test_labels) = [
    extract_features_and_labels(split) for split in (train_df, test_df)
]


In [8]:
# Train for 10 epochs in mini-batches of 32; the object returned by fit() is the cell's output.
ncf_model.fit(
    x=train_features,
    y=train_labels,
    batch_size=32,
    epochs=10,
)


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x21654bc7110>

In [9]:
# evaluate() yields the loss followed by the compiled metric (accuracy).
# The labels in the simulated dataset are uniform random 0/1, so a score
# close to 0.5 is the expected outcome rather than a sign of a bug.
loss, accuracy = ncf_model.evaluate(x=test_features, y=test_labels)
print(f"Test Accuracy: {accuracy}")


Test Accuracy: 0.49755001068115234


Unnamed: 0,User_ID,Item_ID,User_Age_Group,Item_Category,User_Gender,Purchase_Day_of_Week,Purchase_Hour_of_Day,Label
33429,470,646,0,1,0,0,17,0
5013,942,779,2,2,0,4,23,0
7572,321,219,0,2,1,3,19,0
64127,343,524,1,2,1,4,19,1
6370,216,204,0,2,1,0,13,0
66840,525,187,4,6,1,2,15,1
48541,74,840,1,8,0,3,0,1
11649,339,930,0,8,0,2,4,1
26224,391,293,3,9,1,4,15,0
89378,293,397,3,5,0,4,23,1


In [14]:

# Draw 10 held-out rows to score; random_state makes the sample (and therefore
# the printed predictions) repeatable across runs.
predict_df = test_df.sample(10, random_state=42)

# Reuse the shared helper so the input order always matches the order used for
# training; only the feature arrays are needed here, not the labels.
predict_features = extract_features_and_labels(predict_df)[0]

# Run inference
predictions = ncf_model.predict(predict_features)

# Pair each predicted value with the item ID of its row
predictions_with_item_id = zip(predict_df['Item_ID'], predictions.flatten())

# Print the predicted probability for every sampled item
for item_id, pred in predictions_with_item_id:
    print(f"Item ID: {item_id}, Predicted Probability: {pred}")


Item ID: 769, Predicted Probability: 0.09922812134027481
Item ID: 874, Predicted Probability: 0.5258159041404724
Item ID: 676, Predicted Probability: 0.34037068486213684
Item ID: 808, Predicted Probability: 0.40855318307876587
Item ID: 804, Predicted Probability: 0.13756631314754486
Item ID: 88, Predicted Probability: 0.25788238644599915
Item ID: 863, Predicted Probability: 0.33737021684646606
Item ID: 247, Predicted Probability: 0.673025906085968
Item ID: 148, Predicted Probability: 0.21877515316009521
Item ID: 107, Predicted Probability: 0.4380616843700409


In [15]:
# Display the raw array returned by ncf_model.predict for the sampled rows.
predictions

array([[0.09922812],
       [0.5258159 ],
       [0.34037068],
       [0.40855318],
       [0.13756631],
       [0.2578824 ],
       [0.33737022],
       [0.6730259 ],
       [0.21877515],
       [0.43806168]], dtype=float32)