## Python 实现推荐系统的简单例子

-- 基于物品相似度的推荐（利用用户评分的相关性计算物品相似，即 item-based 协同过滤）

In [1]:
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings('ignore')

## 1、读取数据

In [2]:
# Column names for users.dat; header=None below tells pandas the file has no
# header row of its own.
unames = ['user_id', 'gender', 'age', 'occupation', 'zip']
# '::' is a multi-character separator, which only the Python parsing engine
# supports. Naming the engine explicitly avoids the ParserWarning and the
# implicit fallback from the default C engine.
users = pd.read_table(
    'data/users.dat',
    sep='::',
    header=None,
    names=unames,
    engine='python',
)

In [None]:
# Preview the first five user rows (head() defaults to n=5).
users.head()

In [3]:
# Load the rating records; header=None below tells pandas the file has no
# header row, so the column names are supplied here.
rating_names = ['user_id', 'movie_id', 'rating', 'timestamp']
# The multi-character '::' separator requires the Python parsing engine;
# request it explicitly instead of relying on pandas' warning-and-fallback.
ratings = pd.read_table(
    'data/ratings.dat',
    sep='::',
    header=None,
    names=rating_names,
    engine='python',
)

In [None]:
# Preview the first five raw rating records (head() defaults to n=5).
ratings.head()

In [4]:
# Load the movie catalogue; header=None below tells pandas the file has no
# header row, so the column names are supplied here.
movie_names = ['movie_id', 'title', 'genres']
# The multi-character '::' separator requires the Python parsing engine;
# request it explicitly instead of relying on pandas' warning-and-fallback.
# NOTE(review): if this is the stock MovieLens-1M movies.dat it may not be
# UTF-8 (titles contain accented characters) and could need
# encoding='latin-1' -- confirm against the actual file.
movies = pd.read_table(
    'data/movies.dat',
    sep='::',
    header=None,
    names=movie_names,
    engine='python',
)

In [None]:
# Preview the first five movie rows (head() defaults to n=5).
movies.head()

## 2、合并数据

In [5]:
# Inner-join users to their ratings on user_id, then attach the movie
# metadata on movie_id. These are the only columns the tables share, so the
# explicit keys match what an implicit merge would pick.
data = users.merge(ratings, on='user_id').merge(movies, on='movie_id')

In [6]:
# Preview the first ten rows of the merged table.
data.iloc[:10]

Unnamed: 0,user_id,gender,age,occupation,zip,movie_id,rating,timestamp,title,genres
0,1,F,1,10,48067,1193,5,978300760,One Flew Over the Cuckoo's Nest (1975),Drama
1,2,M,56,16,70072,1193,5,978298413,One Flew Over the Cuckoo's Nest (1975),Drama
2,12,M,25,12,32793,1193,4,978220179,One Flew Over the Cuckoo's Nest (1975),Drama
3,15,M,25,7,22903,1193,4,978199279,One Flew Over the Cuckoo's Nest (1975),Drama
4,17,M,50,1,95350,1193,5,978158471,One Flew Over the Cuckoo's Nest (1975),Drama
5,18,F,18,3,95825,1193,4,978156168,One Flew Over the Cuckoo's Nest (1975),Drama
6,19,M,1,10,48073,1193,5,982730936,One Flew Over the Cuckoo's Nest (1975),Drama
7,24,F,25,7,10023,1193,5,978136709,One Flew Over the Cuckoo's Nest (1975),Drama
8,28,F,25,1,14607,1193,3,978125194,One Flew Over the Cuckoo's Nest (1975),Drama
9,33,M,45,3,55421,1193,5,978557765,One Flew Over the Cuckoo's Nest (1975),Drama


In [None]:
# Show every merged row (user attributes, rating, movie metadata) belonging
# to the user whose user_id is 1.
data.loc[data['user_id'] == 1]

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) for the merged
# table; with mixed dtypes describe() reports the numeric columns only.
data.describe()

In [None]:
# Persist the merged table as CSV. to_csv writes the row index by default, so
# the file gains an unnamed leading column when it is read back; pass
# index=False if that column is not wanted.
data.to_csv('data/data.csv')

## 3、评分和评分次数

In [None]:
# Build a per-movie summary frame, starting with the mean rating of each
# title; the rating count is added to the same frame in a later cell.
# NOTE: this rebinds `ratings`, replacing the raw ratings table loaded from
# data/ratings.dat, so the merge cell cannot be re-run after this one.
ratings = data.groupby('title')['rating'].mean().to_frame()
ratings.head()

In [None]:
# Record how many ratings each title received. A mean built from very few
# votes is unreliable (one person giving 5 stars should not make a movie a
# 5-star movie), so this count is used as a popularity filter further down.
ratings['number_of_ratings'] = data['rating'].groupby(data['title']).count()

In [None]:
import matplotlib.pyplot as plt
# IPython magic: render matplotlib figures inline in the notebook output.
%matplotlib inline
# Histogram of the per-movie mean rating, split into 50 bins.
ratings['rating'].hist(bins = 50)

In [None]:
# Histogram of how many ratings each movie received, split into 60 bins.
ratings['number_of_ratings'].hist(bins = 60)

In [None]:
# Explore the relationship between a movie's mean rating and the number of
# ratings it received, drawn as a joint plot (scatter by default) of the two
# columns.
import seaborn as sns
sns.jointplot(x = 'rating',y='number_of_ratings',data = ratings)

In [None]:
# Pivot the merged table into a user x title matrix of ratings, which is the
# input for the movie-to-movie similarity computation. Cells are NaN where a
# user has not rated a title; duplicate (user, title) pairs would be averaged
# by pivot_table's default aggregation.
movie_matrix = pd.pivot_table(
    data,
    values='rating',
    index='user_id',
    columns='title',
)
movie_matrix.head()

In [None]:
# List the ten most frequently rated movies: order by rating count, highest
# first. `ratings` is indexed by title and carries the columns `rating`
# (mean score) and `number_of_ratings`.
ratings.sort_values(by='number_of_ratings', ascending=False).iloc[:10]

In [None]:
# Suppose a user has watched 'Air Force One (1997)' and 'Contact (1997)'.
# Pull each movie's per-user rating column so that other movies can be
# ranked by their similarity to these two.
AFO_user_rating = movie_matrix.loc[:, 'Air Force One (1997)']
contact_user_rating = movie_matrix.loc[:, 'Contact (1997)']

In [None]:
# Preview the first five users' ratings of Air Force One (NaN = not rated).
AFO_user_rating.iloc[:5]

In [None]:
# Preview the first five users' ratings of Contact (NaN = not rated).
contact_user_rating.iloc[:5]

In [None]:
# Compare every movie's vector of user ratings with the Air Force One
# ratings: corrwith computes the column-wise correlation (Pearson by default)
# after aligning on the user_id index, giving one coefficient per title.
similar_to_air_force_one = movie_matrix.corrwith(AFO_user_rating)
similar_to_air_force_one.head()

In [None]:
# Column-wise correlation (Pearson by default) between every movie's user
# ratings and the Contact ratings, aligned on the user_id index; the result
# holds one coefficient per title.
similar_to_contact = movie_matrix.corrwith(contact_user_rating)
similar_to_contact.head()

In [None]:
# The rating matrix is sparse, so many correlations come out as NaN. Drop
# those entries and turn each correlation Series into a one-column DataFrame.
# to_frame(name=...) names the column directly, whereas
# pd.DataFrame(series, columns=[...]) only works while the Series is unnamed.
corr_contact = similar_to_contact.dropna().to_frame(name='Correlation')
corr_AFO = similar_to_air_force_one.dropna().to_frame(name='Correlation')
# Only the last expression of a cell is rendered, so a single preview is kept
# (the earlier mid-cell head() call was evaluated and discarded).
corr_AFO.head()

In [None]:
# Attach each title's rating count to both correlation frames. Index-aligned
# column assignment yields the same values as a left join on the title index,
# but unlike DataFrame.join it does not raise on an overlapping column name
# when this cell is executed a second time.
corr_AFO['number_of_ratings'] = ratings['number_of_ratings']
corr_contact['number_of_ratings'] = ratings['number_of_ratings']
# Only the last expression of a cell is rendered, so a single preview is kept
# (the earlier mid-cell head() call was evaluated and discarded).
corr_contact.head()

In [None]:
# Keep only titles with more than 100 ratings, then list the ten whose
# ratings correlate most strongly with Air Force One.
# NOTE(review): the seed movie correlates perfectly with itself, so it is
# expected to appear in this list -- confirm and skip it when recommending.
(
    corr_AFO
    .query('number_of_ratings > 100')
    .sort_values('Correlation', ascending=False)
    .head(10)
)

In [None]:
# Keep only titles with more than 100 ratings, then list the ten whose
# ratings correlate most strongly with Contact.
# NOTE(review): the seed movie correlates perfectly with itself, so it is
# expected to appear in this list -- confirm and skip it when recommending.
(
    corr_contact
    .query('number_of_ratings > 100')
    .sort_values('Correlation', ascending=False)
    .head(10)
)

## 4、改进

- 基于记忆的协同过滤
- 划分数据为训练集和测试集
- 余弦相似度来计算电影之间的相似度
- 基于模型的协同过滤系统，处理伸缩性和稀疏性
- 均方根误差对模型评估
- 当数据量过大时，可结合深度学习构建推荐系统
- 自动编码器（Autoencoder）和受限玻尔兹曼机（Restricted Boltzmann Machine）

## 5、方法总结和改善

参考链接：

- [基于记忆与基于模型的推荐系统对比](https://blog.csdn.net/wushandinghua/article/details/52693864)

1、基于记忆的推荐系统（协同过滤）
- 用户之间相似
- 产品之间相似

2、基于模型的推荐系统
- 预测用户对于从未见过的产品的喜爱程度