In [1]:
import tensorflow as tf
import pandas as pd 
import numpy as np
from sklearn.model_selection import train_test_split
from collections import Counter
import os
import pickle
import re
from tensorflow.python.ops import math_ops

In [2]:
rating_title = ['UserId' , 'MovieID' , 'Rating' , 'timestamps']
ratings = pd.read_table('../data/ratings.dat' , sep='::' , header=None , names=rating_title , engine='python')
ratings.head()

Unnamed: 0,UserId,MovieID,Rating,timestamps
0,1,1193,5,978300760
1,1,661,3,978302109
2,1,914,3,978301968
3,1,3408,4,978300275
4,1,2355,5,978824291


In [3]:
users_title = ['UserID' , 'Gender' , 'Age' , 'OccupationID' , 'Zip-code']
users = pd.read_table('../data/users.dat' , sep = '::' , header=None , names=users_title , engine='python')
users.head()

Unnamed: 0,UserID,Gender,Age,OccupationID,Zip-code
0,1,F,1,10,48067
1,2,M,56,16,70072
2,3,M,25,15,55117
3,4,M,45,7,2460
4,5,M,25,20,55455


In [4]:
movie_title = ['MovieID' , 'Title' , 'Genres']
movies = pd.read_table('../data/movies.dat' , sep='::' , header=None , names=movie_title , engine='python')
movies.head(30)

Unnamed: 0,MovieID,Title,Genres
0,1,Toy Story (1995),Animation|Children's|Comedy
1,2,Jumanji (1995),Adventure|Children's|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama
4,5,Father of the Bride Part II (1995),Comedy
5,6,Heat (1995),Action|Crime|Thriller
6,7,Sabrina (1995),Comedy|Romance
7,8,Tom and Huck (1995),Adventure|Children's
8,9,Sudden Death (1995),Action
9,10,GoldenEye (1995),Action|Adventure|Thriller


In [5]:
# 读取数据，并且对数据进行预处理
def load_data():
    # 读取用户数据
    users_title = ['UserID' , 'Gender' , 'Age' , 'JobID' , 'Zip-code']
    users = pd.read_table("../data/users.dat",sep = '::',header=None,names=users_title,engine='python')
    users = users.filter(regex = 'UserID|Gender|Age|JobID')  # 按照列进行过滤，只保留这 4 列
    users_origin = users.values  # 把 users的值保存这个副本
    
    # 对用户性别进行处理
    gender_map = {'F':0 , 'M':1}
    users['Gender'] = users['Gender'].map(gender_map)  # 对每个元素进行 map映射
    
    # 对用户年龄进行处理
    age_map = {val:i for i,val in enumerate(set(users['Age']))} # 之所以用 set是因为去除重复
    users['Age'] = users['Age'].map(age_map)   # Age的value 部分是 0-6
    
    # 读取电影数据
    movie_title = ['MovieID' , 'Title' , 'Genres']
    movies = pd.read_table('../data/movies.dat',sep='::',header=None,names=movie_title,engine='python')
    movies_origin = movies.values  # 把 movies的值保存这个副本
    
    # 把 Title中的年份去掉
    pattern = re.compile(r'^(.*)\((\d+)\)$')
    title_map = {val:pattern.match(val).group(1) for i,val in enumerate(set(movies['Title']))}
    movies['Title'] = movies['Title'].map(title_map)
    
    # 电影类型转为数字的set
    genres_set = set()
    for val in movies['Genres'].str.split('|'):
        genres_set.update(val) # 修改当前集合，把val包含的元素全部加入到 genres_set中且去除重复
    
    # 长度统一，空白部分用 '<PAD>'填充
    genres_set.add('<PAD>')
    # 将电影类型的 set改为 map  value部分是电影类型所对应的 int
    genres2int = {val:i for i , val in enumerate(genres_set)}
    
    #将电影类型转成等长数字列表，长度是
    # 下面得到的是一个 map, key 是 电影类型（可以组合），value是个List，表示电影类型的 数字组合
    genres_map = {val:[genres2int[row] for row in val.split('|')] for i,val in enumerate(set(movies['Genres']))}

    
    #将电影类型转成等长数字列表，长度是18，因为一共18种电影
    for key in genres_map:
        for cnt in range(max(genres2int.values()) - len(genres_map[key])):
            genres_map[key].insert(len(genres_map[key]) + cnt,genres2int['<PAD>'])
    
    movies['Genres'] = movies['Genres'].map(genres_map)
    
    # # 生成一个set，里面包含了每个Title中每个词
    title_set = set()
    for val in movies['Title'].str.split():   # 没有参数，按照空格换行符制表符等分割
        title_set.update(val)
    title_set.add('<PAD>')
    
    # 然后把每个词转化为 int类型
    title2int = {val:i for i , val in enumerate(title_set)}
    
    #将电影 Title转为等长的数字列表，长度为15 
    title_count = 15
    # key是电影Title，value 是电影Title分割出来的词的int 表示
    title_map = {val:[title2int[row] for row in val.split()] for i,val in enumerate(set(movies['Title']))}
    
    for key in title_map:
        for cnt in range(title_count - len(title_map[key])):
            title_map[key].insert(len(title_map[key]) + cnt,title2int['<PAD>'])
    # 这样就把原先 movies['Title']改为了 List[int]表示
    movies['Title'] = movies['Title'].map(title_map) 
    
    
    # 读取评分数据集
    ratings_title = ['UserID' , 'MovieID' , 'ratings' , 'timestamps']
    ratings = pd.read_table('../data/ratings.dat', sep='::', header=None, names=ratings_title, engine = 'python')
    ratings = ratings.filter(regex='UserID|MovieID|ratings')
    
    # 合并三个表
    data = pd.merge(pd.merge(ratings , users) , movies)
    
    #将数据分成X和y两张表
    target_fields = ['ratings']
    features_pd, targets_pd = data.drop(target_fields, axis=1), data[target_fields]
    
    features = features_pd.values
    targets_values = targets_pd.values
    
    return title_count, title_set, genres2int, features, targets_values, ratings, users, movies, data, movies_origin, users_origin    

    
    
    

In [6]:
# 序列化对象，并将结果数据流写入到文件对象中
title_count, title_set, genres2int, features, targets_values, ratings, users, movies, data, movies_orig, users_orig = load_data()
pickle.dump((title_count, title_set, genres2int, features, targets_values, ratings, users, movies, data, movies_orig, users_orig), open('../data/preprocess.p', 'wb'))

In [7]:
users.head(30)

Unnamed: 0,UserID,Gender,Age,JobID
0,1,0,0,10
1,2,1,5,16
2,3,1,6,15
3,4,1,2,7
4,5,1,6,20
5,6,0,3,9
6,7,1,1,1
7,8,1,6,12
8,9,1,6,17
9,10,0,1,1


In [8]:
movies.head(30)

Unnamed: 0,MovieID,Title,Genres
0,1,"[3628, 3451, 3784, 3784, 3784, 3784, 3784, 378...","[3, 2, 16, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,..."
1,2,"[4002, 3784, 3784, 3784, 3784, 3784, 3784, 378...","[4, 2, 11, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,..."
2,3,"[1203, 3012, 5158, 3784, 3784, 3784, 3784, 378...","[16, 5, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,..."
3,4,"[5165, 1346, 3859, 3784, 3784, 3784, 3784, 378...","[16, 14, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7..."
4,5,"[1948, 415, 3070, 4076, 1127, 3889, 3784, 3784...","[16, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,..."
5,6,"[2396, 3784, 3784, 3784, 3784, 3784, 3784, 378...","[8, 12, 10, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7..."
6,7,"[1264, 3784, 3784, 3784, 3784, 3784, 3784, 378...","[16, 5, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,..."
7,8,"[723, 453, 5190, 3784, 3784, 3784, 3784, 3784,...","[4, 2, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, ..."
8,9,"[1889, 156, 3784, 3784, 3784, 3784, 3784, 3784...","[8, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, ..."
9,10,"[2361, 3784, 3784, 3784, 3784, 3784, 3784, 378...","[8, 4, 10, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,..."


In [9]:
movies.values[0]

array([1,
       list([3628, 3451, 3784, 3784, 3784, 3784, 3784, 3784, 3784, 3784, 3784, 3784, 3784, 3784, 3784]),
       list([3, 2, 16, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7])],
      dtype=object)

In [10]:
title_count, title_set, genres2int, features, targets_values, ratings, users, movies, data, movies_orig, users_orig = pickle.load(open('../data/preprocess.p', mode='rb'))

In [11]:
import tensorflow as tf
import os 
import pickle

def save_params(params):
    """
    Save parameters to file
    """
    pickle.dump(params, open('params.p', 'wb'))


def load_params():
    """
    Load parameters from file
    """
    return pickle.load(open('params.p', mode='rb'))

In [12]:
# 嵌入矩阵的维度
embed_dim = 32
# 用户ID个数,take()是ndarray的方法，第1个参数的列，第二个参数是axis，表示得到列中对应的各个行组成的 ndarray
uid_max = max(features.take(0,1)) + 1 # 6040
# 性别个数
gender_max = max(features.take(2,1)) + 1 # 1 + 1 = 2
#年龄类别个数
age_max = max(features.take(3,1)) + 1 # 6 + 1 = 7
#职业个数
job_max = max(features.take(4,1)) + 1# 20 + 1 = 21

#电影ID个数
movie_id_max = max(features.take(1,1)) + 1 # 3952
#电影类型个数
movie_categories_max = max(genres2int.values()) + 1 # 18 + 1 = 19
#电影名单词个数
movie_title_max = len(title_set) # 5216

#对电影类型嵌入向量做加和操作的标志，考虑过使用mean做平均，但是没实现mean
combiner = "sum"

#电影名长度
sentences_size = title_count # = 15
#文本卷积滑动窗口，分别滑动2, 3, 4, 5个单词
window_sizes = {2, 3, 4, 5}
#文本卷积核数量
filter_num = 8

#电影ID转下标的字典，数据集中电影ID跟下标不一致，比如第5行的数据电影ID不一定是5
movieid2idx = {val[0]:i for i, val in enumerate(movies.values)}


In [13]:
# Number of Epochs
num_epochs = 5
# Batch Size
batch_size = 256

dropout_keep = 0.5
# Learning Rate
learning_rate = 0.0001
# Show stats for every n number of batches
show_every_n_batches = 20

save_dir = './save'

In [14]:
def get_inputs():
    uid = tf.placeholder(tf.int32, [None, 1], name="uid")
    user_gender = tf.placeholder(tf.int32, [None, 1], name="user_gender")
    user_age = tf.placeholder(tf.int32, [None, 1], name="user_age")
    user_job = tf.placeholder(tf.int32, [None, 1], name="user_job")
    
    movie_id = tf.placeholder(tf.int32, [None, 1], name="movie_id")
    movie_categories = tf.placeholder(tf.int32, [None, 18], name="movie_categories")
    movie_titles = tf.placeholder(tf.int32, [None, 15], name="movie_titles")
    targets = tf.placeholder(tf.int32, [None, 1], name="targets")
    LearningRate = tf.placeholder(tf.float32, name = "LearningRate")
    dropout_keep_prob = tf.placeholder(tf.float32, name = "dropout_keep_prob")
    return uid, user_gender, user_age, user_job, movie_id, movie_categories, movie_titles, targets, LearningRate, dropout_keep_prob

In [15]:
def get_user_embedding(uid, user_gender, user_age, user_job):
    with tf.name_scope("user_embedding"):   # 经过这个就少了很多列了
        # 下面两句是先取均匀的随机数，然后根据 uid取相应的列
        uid_embed_matrix = tf.Variable(tf.random_uniform([uid_max, embed_dim], -1, 1), name = "uid_embed_matrix")
        uid_embed_layer = tf.nn.embedding_lookup(uid_embed_matrix, uid, name = "uid_embed_layer")
        
        # 下面三个的列数目都只有 16 而不是 32  然后行数 是各不相同的
        gender_embed_matrix = tf.Variable(tf.random_uniform([gender_max, embed_dim // 2], -1, 1), name= "gender_embed_matrix")
        gender_embed_layer = tf.nn.embedding_lookup(gender_embed_matrix, user_gender, name = "gender_embed_layer")
        
        age_embed_matrix = tf.Variable(tf.random_uniform([age_max, embed_dim // 2], -1, 1), name="age_embed_matrix")
        age_embed_layer = tf.nn.embedding_lookup(age_embed_matrix, user_age, name="age_embed_layer")
        
        job_embed_matrix = tf.Variable(tf.random_uniform([job_max, embed_dim // 2], -1, 1), name = "job_embed_matrix")
        job_embed_layer = tf.nn.embedding_lookup(job_embed_matrix, user_job, name = "job_embed_layer")
    return uid_embed_layer, gender_embed_layer, age_embed_layer, job_embed_layer

In [16]:
def get_user_feature_layer(uid_embed_layer, gender_embed_layer, age_embed_layer, job_embed_layer):
    with tf.name_scope("user_fc"):
        #第一层全连接，输出维度（也就是这层的节点数目）为 embed_dim
        uid_fc_layer = tf.layers.dense(uid_embed_layer, embed_dim, name = "uid_fc_layer", activation=tf.nn.relu)
        gender_fc_layer = tf.layers.dense(gender_embed_layer, embed_dim, name = "gender_fc_layer", activation=tf.nn.relu)
        age_fc_layer = tf.layers.dense(age_embed_layer, embed_dim, name ="age_fc_layer", activation=tf.nn.relu)
        job_fc_layer = tf.layers.dense(job_embed_layer, embed_dim, name = "job_fc_layer", activation=tf.nn.relu)
        
        #第二层全连接,所以这里是 4*32=128
        user_combine_layer = tf.concat([uid_fc_layer, gender_fc_layer, age_fc_layer, job_fc_layer], 2)  #(?, 1, 128)
        user_combine_layer = tf.contrib.layers.fully_connected(user_combine_layer, 200, tf.tanh)  #(?, 1, 200)
    
        user_combine_layer_flat = tf.reshape(user_combine_layer, [-1, 200])
    return user_combine_layer, user_combine_layer_flat

In [17]:

def get_movie_id_embed_layer(movie_id):
    with tf.name_scope("movie_embedding"):
        movie_id_embed_matrix = tf.Variable(tf.random_uniform([movie_id_max, embed_dim], -1, 1), name = "movie_id_embed_matrix")
        movie_id_embed_layer = tf.nn.embedding_lookup(movie_id_embed_matrix, movie_id, name = "movie_id_embed_layer")
    return movie_id_embed_layer

In [18]:
def get_movie_categories_layers(movie_categories):
    with tf.name_scope("movie_categories_layers"):
        movie_categories_embed_matrix = tf.Variable(tf.random_uniform([movie_categories_max, embed_dim], -1, 1), name = "movie_categories_embed_matrix")
        movie_categories_embed_layer = tf.nn.embedding_lookup(movie_categories_embed_matrix, movie_categories, name = "movie_categories_embed_layer")
        if combiner == "sum":
            # axis=1就是按行求和，有几行结果就有几个元素，可以理解为数据压缩
            movie_categories_embed_layer = tf.reduce_sum(movie_categories_embed_layer, axis=1, keep_dims=True)
    #     elif combiner == "mean":

    return movie_categories_embed_layer