In [1]:
import tensorflow as tf
import pandas as pd 
import numpy as np
from sklearn.model_selection import train_test_split
from collections import Counter
import os
import pickle
import re
from tensorflow.python.ops import math_ops

In [2]:
rating_title = ['UserId' , 'MovieID' , 'Rating' , 'timestamps']
ratings = pd.read_table('../data/ratings.dat' , sep='::' , header=None , names=rating_title , engine='python')
ratings.head()

Unnamed: 0,UserId,MovieID,Rating,timestamps
0,1,1193,5,978300760
1,1,661,3,978302109
2,1,914,3,978301968
3,1,3408,4,978300275
4,1,2355,5,978824291


In [3]:
users_title = ['UserID' , 'Gender' , 'Age' , 'OccupationID' , 'Zip-code']
users = pd.read_table('../data/users.dat' , sep = '::' , header=None , names=users_title , engine='python')
users.head()

Unnamed: 0,UserID,Gender,Age,OccupationID,Zip-code
0,1,F,1,10,48067
1,2,M,56,16,70072
2,3,M,25,15,55117
3,4,M,45,7,2460
4,5,M,25,20,55455


In [4]:
movie_title = ['MovieID' , 'Title' , 'Genres']
movies = pd.read_table('../data/movies.dat' , sep='::' , header=None , names=movie_title , engine='python')
movies.head(30)

Unnamed: 0,MovieID,Title,Genres
0,1,Toy Story (1995),Animation|Children's|Comedy
1,2,Jumanji (1995),Adventure|Children's|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama
4,5,Father of the Bride Part II (1995),Comedy
5,6,Heat (1995),Action|Crime|Thriller
6,7,Sabrina (1995),Comedy|Romance
7,8,Tom and Huck (1995),Adventure|Children's
8,9,Sudden Death (1995),Action
9,10,GoldenEye (1995),Action|Adventure|Thriller


In [25]:
# 读取数据，并且对数据进行预处理
def load_data():
    # 读取用户数据
    users_title = ['UserID' , 'Gender' , 'Age' , 'JobID' , 'Zip-code']
    users = pd.read_table("../data/users.dat",sep = '::',header=None,names=users_title,engine='python')
    users = users.filter(regex = 'UserID|Gender|Age|JobID')  # 按照列进行过滤，只保留这 4 列
    users_origin = users.values  # 把 users的值保存这个副本
    
    # 对用户性别进行处理
    gender_map = {'F':0 , 'M':1}
    users['Gender'] = users['Gender'].map(gender_map)  # 对每个元素进行 map映射
    
    # 对用户年龄进行处理
    age_map = {val:i for i,val in enumerate(set(users['Age']))} # 之所以用 set是因为去除重复
    users['Age'] = users['Age'].map(age_map)   # Age的value 部分是 0-6
    
    # 读取电影数据
    movie_title = ['MovieID' , 'Title' , 'Genres']
    movies = pd.read_table('../data/movies.dat',sep='::',header=None,names=movie_title,engine='python')
    movies_origin = movies.values  # 把 movies的值保存这个副本
    
    # 把 Title中的年份去掉
    pattern = re.compile(r'^(.*)\((\d+)\)$')
    title_map = {val:pattern.match(val).group(1) for i,val in enumerate(set(movies['Title']))}
    movies['Title'] = movies['Title'].map(title_map)
    
    # 电影类型转为数字的set
    genres_set = set()
    for val in movies['Genres'].str.split('|'):
        genres_set.update(val) # 修改当前集合，把val包含的元素全部加入到 genres_set中且去除重复
    
    # 长度统一，空白部分用 '<PAD>'填充
    genres_set.add('<PAD>')
    # 将电影类型的 set改为 map  value部分是电影类型所对应的 int
    genres2int = {val:i for i , val in enumerate(genres_set)}
    
    #将电影类型转成等长数字列表，长度是
    # 下面得到的是一个 map, key 是 电影类型（可以组合），value是个List，表示电影类型的 数字组合
    genres_map = {val:[genres2int[row] for row in val.split('|')] for i,val in enumerate(set(movies['Genres']))}

    
    #将电影类型转成等长数字列表，长度是18，因为一共18种电影
    for key in genres_map:
        for cnt in range(max(genres2int.values()) - len(genres_map[key])):
            genres_map[key].insert(len(genres_map[key]) + cnt,genres2int['<PAD>'])
    
    movies['Genres'] = movies['Genres'].map(genres_map)
    
    # # 生成一个set，里面包含了每个Title中每个词
    title_set = set()
    for val in movies['Title'].str.split():   # 没有参数，按照空格换行符制表符等分割
        title_set.update(val)
    title_set.add('<PAD>')
    
    # 然后把每个词转化为 int类型
    title2int = {val:i for i , val in enumerate(title_set)}
    
    #将电影 Title转为等长的数字列表，长度为15 
    title_count = 15
    # key是电影Title，value 是电影Title分割出来的词的int 表示
    title_map = {val:[title2int[row] for row in val.split()] for i,val in enumerate(set(movies['Title']))}
    
    for key in title_map:
        for cnt in range(title_count - len(title_map[key])):
            title_map[key].insert(len(title_map[key]) + cnt,title2int['<PAD>'])
    # 这样就把原先 movies['Title']改为了 List[int]表示
    movies['Title'] = movies['Title'].map(title_map) 
    
    
    # 读取评分数据集
    ratings_title = ['UserID' , 'MovieID' , 'ratings' , 'timestamps']
    ratings = pd.read_table('../data/ratings.dat', sep='::', header=None, names=ratings_title, engine = 'python')
    ratings = ratings.filter(regex='UserID|MovieID|ratings')
    
    # 合并三个表
    data = pd.merge(pd.merge(ratings , users) , movies)
    
    #将数据分成X和y两张表
    target_fields = ['ratings']
    features_pd, targets_pd = data.drop(target_fields, axis=1), data[target_fields]
    
    features = features_pd.values
    targets_values = targets_pd.values
    
    return title_count, title_set, genres2int, features, targets_values, ratings, users, movies, data, movies_origin, users_origin    

    
    
    

In [27]:
# 序列化对象，并将结果数据流写入到文件对象中
title_count, title_set, genres2int, features, targets_values, ratings, users, movies, data, movies_orig, users_orig = load_data()
pickle.dump((title_count, title_set, genres2int, features, targets_values, ratings, users, movies, data, movies_orig, users_orig), open('../data/preprocess.p', 'wb'))

In [28]:
users.head(30)

Unnamed: 0,UserID,Gender,Age,JobID
0,1,0,0,10
1,2,1,5,16
2,3,1,6,15
3,4,1,2,7
4,5,1,6,20
5,6,0,3,9
6,7,1,1,1
7,8,1,6,12
8,9,1,6,17
9,10,0,1,1


In [29]:
movies.head(30)

Unnamed: 0,MovieID,Title,Genres
0,1,"[3908, 1567, 3415, 3415, 3415, 3415, 3415, 341...","[18, 3, 0, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,..."
1,2,"[764, 3415, 3415, 3415, 3415, 3415, 3415, 3415...","[13, 3, 16, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5..."
2,3,"[1658, 128, 2892, 3415, 3415, 3415, 3415, 3415...","[0, 11, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,..."
3,4,"[2299, 3402, 3768, 3415, 3415, 3415, 3415, 341...","[0, 9, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, ..."
4,5,"[5095, 3428, 3461, 13, 4110, 3417, 3415, 3415,...","[0, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, ..."
5,6,"[3424, 3415, 3415, 3415, 3415, 3415, 3415, 341...","[12, 14, 6, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5..."
6,7,"[3928, 3415, 3415, 3415, 3415, 3415, 3415, 341...","[0, 11, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,..."
7,8,"[3509, 3740, 2329, 3415, 3415, 3415, 3415, 341...","[13, 3, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,..."
8,9,"[1574, 887, 3415, 3415, 3415, 3415, 3415, 3415...","[12, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,..."
9,10,"[339, 3415, 3415, 3415, 3415, 3415, 3415, 3415...","[12, 13, 6, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5..."


In [30]:
movies.values[0]

array([1,
       list([3908, 1567, 3415, 3415, 3415, 3415, 3415, 3415, 3415, 3415, 3415, 3415, 3415, 3415, 3415]),
       list([18, 3, 0, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5])],
      dtype=object)

In [31]:
title_count, title_set, genres2int, features, targets_values, ratings, users, movies, data, movies_orig, users_orig = pickle.load(open('../data/preprocess.p', mode='rb'))