## 从零开始实现FM

In [None]:
from google.colab import drive
drive.mount('/content/drive')

In [12]:
import pandas as pd
import numpy as np

from tensorflow.keras import *
from tensorflow.keras.layers import *
from tensorflow.keras.models import *
from tensorflow.keras.callbacks import *
import tensorflow.keras.backend as K

from sklearn.model_selection import train_test_split
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tqdm import tqdm

In [13]:
# dense特征取对数　　sparse特征进行类别编码
def process_feat(data, dense_feats, sparse_feats):
    df = data.copy()
    # dense
    df_dense = df[dense_feats].fillna(0.0)
    for f in tqdm(dense_feats):
        df_dense[f] = df_dense[f].apply(lambda x: np.log(1 + x) if x > -1 else -1)

    # sparse
    df_sparse = df[sparse_feats].fillna('-1')
    for f in tqdm(sparse_feats):
        lbe = LabelEncoder()
        df_sparse[f] = lbe.fit_transform(df_sparse[f])

    df_sparse_arr = []
    for f in tqdm(sparse_feats):
        data_new = pd.get_dummies(df_sparse.loc[:, f].values)
        data_new.columns = [f + "_{}".format(i) for i in range(data_new.shape[1])]
        df_sparse_arr.append(data_new)

    df_new = pd.concat([df_dense] + df_sparse_arr, axis=1)
    return df_new

In [14]:
# FM 特征组合层
class crossLayer(layers.Layer):
    def __init__(self, input_dim, output_dim=10, **kwargs):
        super(crossLayer, self).__init__(**kwargs)

        self.input_dim = input_dim
        self.output_dim = output_dim
        # 定义交叉特征的权重
        self.kernel = self.add_weight(name='kernel',
                        shape=(self.input_dim, self.output_dim),
                        initializer='glorot_uniform',
                        trainable=True)

    def call(self, x):  # 对照上述公式中的二次项优化公式一起理解
        a = K.pow(K.dot(x, self.kernel), 2)
        b = K.dot(K.pow(x, 2), K.pow(self.kernel, 2))
        return 0.5 * K.mean(a - b, 1, keepdims=True)

In [16]:
# 定义FM模型
def FM(feature_dim):
    inputs = Input(shape=(feature_dim,))

    # 一阶特征
    linear = Dense(units=1,
                   kernel_regularizer=regularizers.l2(0.01),
                   bias_regularizer=regularizers.l2(0.01))(inputs)

    # 二阶特征
    cross = crossLayer(feature_dim)(inputs)
    add = Add()([linear, cross])  # 将一阶特征与二阶特征相加构建FM模型

    pred = Dense(units=1, activation="sigmoid")(add)
    model = Model(inputs=inputs, outputs=pred)

    model.summary()
    model.compile(loss='binary_crossentropy',
                  optimizer=optimizers.Adam(),
                  metrics=['binary_accuracy'])

    return model

In [17]:
# 读取数据
print('loading data...')
data = pd.read_csv('/content/criteo_sample.txt')

loading data...


In [19]:
data.sample(20)

Unnamed: 0,label,I1,I2,I3,I4,I5,I6,I7,I8,I9,...,C17,C18,C19,C20,C21,C22,C23,C24,C25,C26
51,0,,119,4.0,4.0,13528.0,,0.0,7.0,35.0,...,07c540c4,48ce336b,,,ea6a0e31,,3a171ecb,da408463,,
39,0,10.0,11,3.0,3.0,1026.0,3.0,88.0,3.0,131.0,...,27c07bd6,e88ffc9d,712d530c,b1252a9d,9ecb9e0d,,bcdee96c,a8380e43,cb079c2d,37c5e077
168,1,1.0,2,76.0,4.0,0.0,4.0,1.0,4.0,4.0,...,e5ba7672,38f08461,,,79fe2943,,bcdee96c,325bcd40,,
108,0,,29,4.0,4.0,12245.0,,0.0,19.0,73.0,...,07c540c4,e7648a8f,,,0014c32a,,32c7478e,3b183c5c,,
40,0,,5,22.0,5.0,10324.0,,0.0,5.0,13.0,...,776ce399,2585827d,21ddcdc9,5840adea,a66e7b01,,be7c41b4,e33735a0,e8b83407,f95af538
170,0,1.0,2921,,0.0,48.0,17.0,20.0,10.0,84.0,...,27c07bd6,7ef5affa,21ddcdc9,a458ea53,a716bbe2,,3a171ecb,3fdb382b,001f3601,a39e1586
91,0,,1,4.0,1.0,235065.0,,0.0,3.0,1.0,...,e5ba7672,130ebfcd,,,f15fe1ee,,32c7478e,2896ad66,,
175,0,,8,8.0,12.0,39343.0,1820.0,0.0,19.0,318.0,...,e5ba7672,3ae505af,,,0014c32a,,423fab69,3b183c5c,,
61,0,,0,34.0,3.0,,,0.0,3.0,3.0,...,2005abd1,891589e7,712d530c,b1252a9d,c2af6d9f,,32c7478e,58e38a64,ea9a246c,70451962
25,1,9.0,1,2.0,5.0,18.0,5.0,9.0,5.0,5.0,...,07c540c4,7e32f7a4,,,a4b7004c,,32c7478e,b34f3128,,


In [20]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200 entries, 0 to 199
Data columns (total 40 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   label   200 non-null    int64  
 1   I1      110 non-null    float64
 2   I2      200 non-null    int64  
 3   I3      166 non-null    float64
 4   I4      165 non-null    float64
 5   I5      194 non-null    float64
 6   I6      149 non-null    float64
 7   I7      190 non-null    float64
 8   I8      200 non-null    float64
 9   I9      190 non-null    float64
 10  I10     110 non-null    float64
 11  I11     190 non-null    float64
 12  I12     43 non-null     float64
 13  I13     165 non-null    float64
 14  C1      200 non-null    object 
 15  C2      200 non-null    object 
 16  C3      191 non-null    object 
 17  C4      191 non-null    object 
 18  C5      200 non-null    object 
 19  C6      168 non-null    object 
 20  C7      200 non-null    object 
 21  C8      200 non-null    object 
 22  C9

In [22]:
# dense 特征开头是I，sparse特征开头是C，Label是标签
cols = data.columns.values
dense_feats = [f for f in cols if f[0] == 'I']
sparse_feats = [f for f in cols if f[0] == 'C']

# 对dense数据和sparse数据分别处理
print('processing features')
feats = process_feat(data, dense_feats, sparse_feats)

# 划分训练和验证数据
x_trn, x_tst, y_trn, y_tst = train_test_split(feats, data['label'], test_size=0.2, random_state=2020)

# 定义模型
model = FM(feats.shape[1])

# 训练模型
model.fit(x_trn, y_trn, epochs=10, batch_size=128, validation_data=(x_tst, y_tst))

processing features


100%|██████████| 13/13 [00:00<00:00, 1121.24it/s]
100%|██████████| 26/26 [00:00<00:00, 1788.97it/s]
100%|██████████| 26/26 [00:00<00:00, 1331.98it/s]


Epoch 1/10
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 213ms/step - binary_accuracy: 0.7557 - loss: 0.5675 - val_binary_accuracy: 0.7500 - val_loss: 0.5935
Epoch 2/10
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 33ms/step - binary_accuracy: 0.7557 - loss: 0.5495 - val_binary_accuracy: 0.7500 - val_loss: 0.5918
Epoch 3/10
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 30ms/step - binary_accuracy: 0.7703 - loss: 0.5251 - val_binary_accuracy: 0.7500 - val_loss: 0.5910
Epoch 4/10
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 32ms/step - binary_accuracy: 0.7729 - loss: 0.5128 - val_binary_accuracy: 0.7500 - val_loss: 0.5899
Epoch 5/10
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 35ms/step - binary_accuracy: 0.7729 - loss: 0.5027 - val_binary_accuracy: 0.7500 - val_loss: 0.5880
Epoch 6/10
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 31ms/step - binary_accuracy: 0.7625 - loss: 0.4966 - val_bina

<keras.src.callbacks.history.History at 0x7c3232eab220>