In [1]:
# Part 1
# Load libraries and the dataset
from keras.layers import Input, Embedding, Flatten, Dense, Concatenate
from keras.models import Model
from sklearn.model_selection import train_test_split
from sklearn import metrics

import numpy as np
import pandas as pd

# Seed NumPy's global random number generator.
# NOTE(review): only NumPy is seeded here; whether this also fixes the
# Keras/TensorFlow weight initialisation is not visible in this notebook - confirm.
np.random.seed(123456)
# Ratings table, read from a path relative to the notebook's directory.
data = pd.read_csv('../Data/ratings.csv')

Using TensorFlow backend.
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


In [2]:
# Preview the first rows of the loaded ratings table.
data.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [3]:
# Part 2
# Data cleaning
def get_data(data):
    """Re-index the ratings table and split it into train and test sets.

    The ``timestamp`` column is removed and the raw ``userId`` / ``movieId``
    values are replaced by consecutive integers (0, 1, 2, ...) assigned in
    order of first appearance. The rows are then shuffled and split with
    ``test_size=0.2``.

    The input frame is never modified: every step produces a new frame, so
    the function can be called repeatedly on the same ``data`` object (the
    previous in-place version failed or re-mapped already re-mapped ids on a
    second call).

    Parameters
    ----------
    data : pandas.DataFrame
        Ratings with ``userId``, ``movieId`` and ``timestamp`` columns.

    Returns
    -------
    train, test : pandas.DataFrame
        The two parts returned by ``train_test_split``.
    n_users, n_movies : int
        Number of distinct user ids and movie ids found in ``data``.

    Raises
    ------
    KeyError
        If ``data`` has no ``timestamp`` column.
    """
    # Remove the timestamp on a new frame instead of mutating the caller's.
    ratings = data.drop('timestamp', axis=1)

    # Distinct ids, in order of first appearance.
    users = ratings.userId.unique()
    movies = ratings.movieId.unique()

    # Original id -> new index: the id stored at movies[i] becomes i.
    movie_index = {movie_id: position for position, movie_id in enumerate(movies)}
    user_index = {user_id: position for position, user_id in enumerate(users)}

    # Replace the original ids by their consecutive integer index.
    ratings = ratings.assign(
        movieId=ratings.movieId.map(movie_index),
        userId=ratings.userId.map(user_index),
    )

    # Shuffle the rows.
    ratings = ratings.sample(frac=1.0).reset_index(drop=True)

    # Build the training and test sets.
    train_part, test_part = train_test_split(ratings, test_size=0.2)

    return train_part, test_part, len(users), len(movies)

# These four results live in the notebook namespace: create_model() and
# predictions() below read train, test, n_users and n_movies as globals.
train, test, n_users, n_movies = get_data(data)

## 用堆疊法來集成多個神經網路。基學習器是三個神經網路，嵌入層的輸出維度分別是5、10、15。超學習器為貝氏Ridge迴歸。

In [4]:
# Part 3
# Helper functions for the base learners
def create_model(n_features=5,
                 train_model=True,
                 load_weights=False,
                 epochs=10):
    """Build, compile and optionally train one embedding-based rating model.

    The network has two inputs (a user index and a movie index). Each index
    goes through its own embedding layer, the two flattened embeddings are
    concatenated and passed through Dense(128) -> Dense(32) -> Dense(1).
    The model is compiled with the 'adam' optimizer and mean squared error.

    Reads the notebook globals ``n_movies``, ``n_users`` and ``train``.

    Parameters
    ----------
    n_features : int
        Output dimension of both embedding layers.
    train_model : bool
        When True (default) the model is fitted on the global ``train``
        frame before being returned; when False the compiled but untrained
        model is returned. (This flag was previously ignored.)
    load_weights : bool
        Accepted for interface compatibility; no weights file is referenced
        anywhere in this function, so the flag currently has no effect.
    epochs : int
        Number of training epochs passed to ``fit`` (default 10).

    Returns
    -------
    The compiled (and, if requested, trained) Keras ``Model``.
    """
    # Movie branch: input index -> embedding -> flattened vector.
    movie_in = Input(shape=[1], name="Movie")
    movie_embed = Embedding(n_movies, n_features, name="Movie_Embed")(movie_in)
    flat_movie = Flatten(name="FlattenM")(movie_embed)

    # User branch: input index -> embedding -> flattened vector.
    user_in = Input(shape=[1], name="User")
    user_embed = Embedding(n_users, n_features, name="User_Embed")(user_in)
    flat_user = Flatten(name="FlattenU")(user_embed)

    # Concatenate both branches and feed them through the dense layers.
    # No activation argument is passed to any of the Dense layers.
    concat = Concatenate()([flat_movie, flat_user])
    dense_1 = Dense(128)(concat)
    dense_2 = Dense(32)(dense_1)
    out = Dense(1)(dense_2)

    # Compile the model. Input order is [user, movie]; fit/predict must match.
    model = Model([user_in, movie_in], out)
    model.compile('adam', 'mean_squared_error')

    # Train the network only when asked to.
    if train_model:
        model.fit([train.userId, train.movieId], train.rating,
                  epochs=epochs, verbose=1)

    return model

def predictions(model):
    """Return ``model``'s predictions for the global ``test`` split."""
    # Inputs are passed in the order [user ids, movie ids].
    test_inputs = [test.userId, test.movieId]
    return model.predict(test_inputs)


In [8]:
# type(preds5)

numpy.ndarray

In [7]:
# preds5.shape

(20168, 1)

In [9]:
# preds10.shape

(20168, 1)

In [12]:
# preds15.shape

(20168, 1)

In [13]:
# np.stack([preds5, preds10, preds15],axis=-1).shape

(20168, 1, 3)

In [15]:
# np.stack([preds5, preds10, preds15],axis=-1)[0]

array([[3.7907405, 3.9538522, 3.7271705]], dtype=float32)

In [24]:
# np.stack([preds5, preds10, preds15],axis=-1)[0].shape

(1, 3)

In [19]:
# np.stack([preds5, preds10, preds15],axis=-1).shape

(20168, 1, 3)

In [28]:
# np.stack([preds5, preds10, preds15],axis=-1)

array([[[3.7907405, 3.9538522, 3.7271705]],

       [[3.967098 , 4.132909 , 4.319345 ]],

       [[3.9918156, 4.1864166, 4.029768 ]],

       ...,

       [[3.6014662, 3.8807626, 3.6360133]],

       [[4.7140737, 4.8692975, 4.8886924]],

       [[3.246836 , 3.240417 , 3.1273239]]], dtype=float32)

In [20]:
# np.stack([preds5, preds10, preds15],axis=-1).reshape(-1, 3)

array([[3.7907405, 3.9538522, 3.7271705],
       [3.967098 , 4.132909 , 4.319345 ],
       [3.9918156, 4.1864166, 4.029768 ],
       ...,
       [3.6014662, 3.8807626, 3.6360133],
       [4.7140737, 4.8692975, 4.8886924],
       [3.246836 , 3.240417 , 3.1273239]], dtype=float32)

In [22]:
# np.stack([preds5, preds10, preds15],axis=-1).reshape(-1, 3).shape

(20168, 3)

In [23]:
# np.stack([preds5, preds10, preds15],axis=-1).reshape(-1, 3)[0]

array([3.7907405, 3.9538522, 3.7271705], dtype=float32)

In [25]:
# np.stack([preds5, preds10, preds15],axis=-1).reshape(-1, 3)[0].shape

(3,)

In [27]:
# Part 4
# Build and train the three base learners (embedding sizes 5, 10 and 15)
model5, model10, model15 = [create_model(width) for width in (5, 10, 15)]

# Predict on the test data with each base learner
preds5, preds10, preds15 = [predictions(net) for net in (model5, model10, model15)]

# Put the three prediction columns side by side to form the meta-learner's
# training data: three (N, 1) arrays -> one (N, 3) array
preds = np.concatenate([preds5, preds10, preds15], axis=1)

In [38]:
preds.shape

(20168, 3)

In [39]:
preds

array([[3.7907405, 3.9538522, 3.7271705],
       [3.967098 , 4.132909 , 4.319345 ],
       [3.9918156, 4.1864166, 4.029768 ],
       ...,
       [3.6014662, 3.8807626, 3.6360133],
       [4.7140737, 4.8692975, 4.8886924],
       [3.246836 , 3.240417 , 3.1273239]], dtype=float32)

In [44]:
# Part 5
# Train the meta-learner
# Fits on the base-learner predictions and ratings of the test split, leaving
# out the last 1000 rows. The following cell repeats these same statements.
from sklearn.linear_model import BayesianRidge
meta_learner = BayesianRidge()
meta_learner.fit(preds[:-1000], test.rating[:-1000])

BayesianRidge()

In [45]:
# Part 5
# Train the meta-learner
from sklearn.linear_model import BayesianRidge

# Everything except the last 1000 test rows is used for fitting;
# the last 1000 rows are kept for evaluation.
fit_rows = slice(None, -1000)
eval_rows = slice(-1000, None)

meta_learner = BayesianRidge()
meta_learner.fit(preds[fit_rows], test.rating[fit_rows])

# Evaluate each base learner and the ensemble on the last 1000 rows
held_out_ratings = test.rating[eval_rows]
scoreboard = [
    ('Base Learner 5 Features', preds5[eval_rows]),
    ('Base Learner 10 Features', preds10[eval_rows]),
    ('Base Learner 15 Features', preds15[eval_rows]),
    ('Ensemble', meta_learner.predict(preds[eval_rows])),
]
for title, estimate in scoreboard:
    print(title)
    print(metrics.mean_squared_error(held_out_ratings, estimate))

Base Learner 5 Features
0.7573521303212949
Base Learner 10 Features
0.7976369724866421
Base Learner 15 Features
0.774595203453332
Ensemble
0.7584831311054545


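## 在這次執行中，集成模型的均方誤差 (0.7585) 低於嵌入維度為 10 與 15 的基學習器 (0.7976、0.7746)，但略高於嵌入維度為 5 的基學習器 (0.7574)；也就是說，集成後的效能接近最佳的基學習器，但並未得到最低的均方誤差，且結果會隨每次訓練而略有變動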