# 推荐

In [None]:
#用户数目和电影数目
from utils import n_Users, n_Movies

In [None]:
## SVD,基于模型的协同过滤

In [1]:
from __future__ import division  
import numpy as np  
import scipy as sp  
from numpy.random import random  

class SVD_C:
    """Biased matrix-factorisation ("SVD") recommender trained with SGD.

    A rating is predicted as
    ``mu + bi[item] + bu[user] + sum(qi[item] * pu[user])`` and clipped to the
    range [1, 5].

    Attributes:
        X: training samples; each row is read as (user id, item id, rating).
        k: number of latent components.
        mu: mean of the training ratings (column 2 of ``X``).
        bi, bu: per-item / per-user bias terms, keyed by id.
        qi, pu: per-item / per-user latent vectors of shape (k, 1), keyed by id.
        ItemsForUser: ``{item id: {user id: rating}}``.
        UsersForItem: ``{user id: {item id: rating}}``.
    """

    def __init__(self, X, k=20):
        """Store the training data and initialise all model parameters.

        Args:
            X: 2-D array whose rows are (user id, item id, rating).
            k: number of latent components.
        """
        self.X = np.asarray(X)
        self.k = k
        # Mean rating. The ratings live in column 2 of X; the previous code
        # read an attribute (self.R) that was never assigned.
        self.mu = np.mean(self.X[:, 2])

        # Model parameters.
        self.bi = {}
        self.bu = {}

        self.qi = {}
        self.pu = {}

        # NOTE: the attribute names are kept for compatibility; the first map
        # is keyed by item id, the second by user id.
        self.ItemsForUser = {}  # item id -> {user id: rating}
        self.UsersForItem = {}  # user id -> {item id: rating}

        for row in self.X:
            uid, i_id, rat = row[0], row[1], row[2]

            self.ItemsForUser.setdefault(i_id, {})[uid] = rat
            self.UsersForItem.setdefault(uid, {})[i_id] = rat

            self.bi.setdefault(i_id, 0)
            self.bu.setdefault(uid, 0)

            # Only draw a random vector the first time an id is seen.
            # NOTE(review): the scale is random / 10 * sqrt(k); dividing by
            # (10 * sqrt(k)) may have been intended -- confirm.
            if i_id not in self.qi:
                self.qi[i_id] = random((self.k, 1)) / 10 * (np.sqrt(self.k))
            if uid not in self.pu:
                self.pu[uid] = random((self.k, 1)) / 10 * (np.sqrt(self.k))

    def pred(self, uid, i_id):
        """Predict the rating of user ``uid`` for item ``i_id``.

        Unknown users/items are registered with zero bias and a zero latent
        vector, so their prediction falls back to the remaining terms.

        Returns:
            The predicted rating, clipped to the range [1, 5].
        """
        self.bi.setdefault(i_id, 0)
        self.bu.setdefault(uid, 0)

        self.qi.setdefault(i_id, np.zeros((self.k, 1)))
        self.pu.setdefault(uid, np.zeros((self.k, 1)))

        # Identity test: "== None" on a numpy array compares elementwise and
        # makes the surrounding "if" ambiguous.
        if self.qi[i_id] is None:
            self.qi[i_id] = np.zeros((self.k, 1))
        if self.pu[uid] is None:
            self.pu[uid] = np.zeros((self.k, 1))

        ans = (self.mu + self.bi[i_id] + self.bu[uid]
               + np.sum(self.qi[i_id] * self.pu[uid]))

        # Keep the prediction inside the 1-5 range.
        if ans > 5:
            return 5
        elif ans < 1:
            return 1
        return ans

    def train(self, steps=20, gamma=0.04, Lambda=0.15):
        """Run stochastic gradient descent over the training samples.

        Args:
            steps: number of passes over the training data.
            gamma: learning rate; multiplied by 0.93 after every pass.
            Lambda: regularisation weight.
        """
        n_samples = self.X.shape[0]
        for step in range(steps):
            print("the  %s -th  step is running" % step)
            rmse_sum = 0.0

            # Visit the training samples in a fresh random order each pass.
            for i in np.random.permutation(n_samples):
                uid = self.X[i][0]
                i_id = self.X[i][1]
                rat = self.X[i][2]

                # Prediction residual, accumulated as a squared error.
                eui = rat - self.pred(uid, i_id)
                rmse_sum += eui ** 2

                self.bu[uid] += gamma * (eui - Lambda * self.bu[uid])
                self.bi[i_id] += gamma * (eui - Lambda * self.bi[i_id])

                # Copy: "+=" updates the stored array in place, so a plain
                # reference would already hold the new qi when pu is updated.
                q_old = self.qi[i_id].copy()
                self.qi[i_id] += gamma * (eui * self.pu[uid] - Lambda * self.qi[i_id])
                self.pu[uid] += gamma * (eui * q_old - Lambda * self.pu[uid])

            # Learning-rate decay.
            gamma = gamma * 0.93
            print("the rmse of this step on train data is  %s"
                  % np.sqrt(rmse_sum / float(n_samples)))

    def test(self, test_X):
        """Predict every row of ``test_X`` and print the RMSE.

        Args:
            test_X: rows of (user id, item id, rating).

        Returns:
            List with one predicted rating per row of ``test_X``.
        """
        test_X = np.array(test_X)
        output = []
        sums = 0

        for row in test_X:
            pre = self.pred(row[0], row[1])
            output.append(pre)
            sums += (pre - row[2]) ** 2
        rmse = np.sqrt(sums / float(test_X.shape[0]))
        print("the rmse on test data is  %s" % rmse)
        return output

## 构建特征

In [9]:
# 这是构建特征部分
from __future__ import division

import cPickle
import numpy as np
import scipy.io as sio

class DataRewriter:
  """Turns (user, movie) rows into collaborative-filtering feature rows.

  NOTE(review): several names used by the methods below are not defined in
  this class (userIndex, movietIndex, movieReco, svdReco, and everything
  eventReco reads). They are flagged where they occur and must be supplied
  before rewriteData can run end to end.
  """

  def __init__(self):
    """Load the rating matrix, the rating dictionaries and the user similarity."""
    # Rating matrix, read from a Matrix Market file and densified.
    self.userMovieScores = sio.mmread("PE_userMovieScores").todense()

    # NOTE: pickle must only be used on trusted files.
    # The loaded dictionaries are kept on the instance; previously they were
    # bound to locals and discarded, and the file handles were never closed.
    # Users who rated each movie.
    with open("PE_usersForMoive.pkl", 'rb') as fh:
      self.usersForMovie = cPickle.load(fh)
    # Movies rated by each user.
    with open("PE_moviesForUser.pkl", 'rb') as fh:
      self.moviesForUser = cPickle.load(fh)

    # User similarity matrix computed from user attributes.
    self.userSimMatrix = sio.mmread("US_userSimMatrix").todense()

    # Movie similarity matrix computed from movie attributes; not available yet.
    #self.moviePropSim = sio.mmread("EV_eventPropSim").todense()

  def userReco(self, userId, movieId):
    """
    User-based collaborative filtering score for (userId, movieId).

    Pseudo-code of the idea:
    for item i
      for every other user v that has a preference for i
        compute similarity s between u and v
        incorporate v's preference for i weighted by s into running average
    return top items ranked by weighted average

    Returns the similarity-weighted sum of all users' scores for the movie
    minus this user's own score, or 0 when an index is out of range.
    """
    # NOTE(review): userIndex and movietIndex are not assigned in __init__.
    i = self.userIndex[userId]
    j = self.movietIndex[movieId]
    vs = self.userMovieScores[:, j]
    sims = self.userSimMatrix[i, :]
    prod = sims * vs
    try:
      return prod[0, 0] - self.userMovieScores[i, j]
    except IndexError:
      return 0

  def eventReco(self, userId, eventId):
    """
    Item-based collaborative filtering scores for (userId, eventId).

    Pseudo-code of the idea:
    for item i
      for every item j that u has a preference for
        compute similarity s between i and j
        add u's preference for j weighted by s to a running average
    return top items, ranked by weighted average

    Returns a (pscore, cscore) pair.
    """
    # NOTE(review): m1, m2, self.similarity and self.movie_user are not
    # defined anywhere in this class; these statements raise NameError as
    # written. Confirm whether they belong to a different implementation.
    self.similarity.setdefault(m1,{})
    self.similarity.setdefault(m2,{})

    self.movie_user.setdefault(m1,{})
    self.movie_user.setdefault(m2,{})
    self.similarity[m1].setdefault(m2,-1)
    self.similarity[m2].setdefault(m1,-1)

    # NOTE(review): userIndex, eventIndex, userEventScores, eventPropSim and
    # eventContSim are not assigned in __init__.
    i = self.userIndex[userId]
    j = self.eventIndex[eventId]
    js = self.userEventScores[i, :]
    psim = self.eventPropSim[:, j]
    csim = self.eventContSim[:, j]
    pprod = js * psim
    cprod = js * csim
    pscore = 0
    cscore = 0
    try:
      pscore = pprod[0, 0] - self.userEventScores[i, j]
    except IndexError:
      pass
    try:
      cscore = cprod[0, 0] - self.userEventScores[i, j]
    except IndexError:
      pass
    return pscore, cscore

  def rewriteData(self, start=1, train=True, header=True):
    """
    Combine the user-based, item-based and model-based collaborative
    filtering scores into one feature row per input line, producing the data
    used by the classifier.

    start:  1-based number of the first input line to process.
    train:  read train.csv (and append the rating column) when true,
            otherwise read test.csv.
    header: write a column-name line first when true.

    Output goes to "data_" + the input file name.
    """
    fn = "train.csv" if train else "test.csv"
    # Context managers close both files even if a row fails.
    with open(fn, 'rb') as fin, open("data_" + fn, 'wb') as fout:
      # write output header
      if header:
        ocolnames = ["user_reco", "item_reco", "svd_reco"]
        if train:
          ocolnames.append("rating")
        fout.write(",".join(ocolnames) + "\n")

      for ln, line in enumerate(fin, 1):
        if ln < start:
          continue
        cols = line.strip().split(",")
        userId = cols[0]
        movieId = cols[1]

        user_reco = self.userReco(userId, movieId)
        # NOTE(review): movieReco and svdReco are not defined in this class.
        item_reco = self.movieReco(userId, movieId)
        svd_reco = self.svdReco(userId, movieId)

        # The item score is bound to item_reco; the row previously referenced
        # an undefined name (movie_reco).
        ocols = [user_reco, item_reco, svd_reco]
        if train:
          # NOTE(review): userEventScores is not assigned in __init__ and the
          # ids here are raw CSV strings -- confirm the intended rating lookup.
          ocols.append(self.userEventScores[userId][movieId]) # rating
        fout.write(",".join(map(str, ocols)) + "\n")

  def rewriteTrainingSet(self):
    """Rewrite train.csv with the default start line and header."""
    # Keyword argument: the first positional parameter of rewriteData is
    # `start`, not `train`.
    self.rewriteData(train=True)

  def rewriteTestSet(self):
    """Rewrite test.csv with the default start line and header."""
    self.rewriteData(train=False)

# When running with cython, the actual class will be converted to a .so
# file, and the following code (along with the commented out import below)
# will need to be put into another .py and this should be run.

#import CRegressionData as rd

# Build the rewriter once; its constructor loads the matrices from disk.
dr = DataRewriter()

# Training features: begin at input line 2 and emit a header row.
print("生成训练数据...\n")
dr.rewriteData(start=2, train=True, header=True)

# Prediction features: same settings, reading the test file instead.
print("生成预测数据...\n")
dr.rewriteData(start=2, train=False, header=True)

生成训练数据...

train.csv:500 (userId, eventId)=(123290209, 1887085024)
train.csv:1000 (userId, eventId)=(272886293, 199858305)
train.csv:1500 (userId, eventId)=(395305791, 1582270949)
train.csv:2000 (userId, eventId)=(527523423, 3272728211)
train.csv:2500 (userId, eventId)=(651258472, 792632006)
train.csv:3000 (userId, eventId)=(811791433, 524756826)
train.csv:3500 (userId, eventId)=(985547042, 1269035551)
train.csv:4000 (userId, eventId)=(1107615001, 173949238)
train.csv:4500 (userId, eventId)=(1236336671, 3849306291)
train.csv:5000 (userId, eventId)=(1414301782, 2652356640)
train.csv:5500 (userId, eventId)=(1595465532, 955398943)
train.csv:6000 (userId, eventId)=(1747091728, 2131379889)
train.csv:6500 (userId, eventId)=(1914182220, 955398943)
train.csv:7000 (userId, eventId)=(2071842684, 1076364848)
train.csv:7500 (userId, eventId)=(2217853337, 3051438735)
train.csv:8000 (userId, eventId)=(2338481531, 2525447278)
train.csv:8500 (userId, eventId)=(2489551967, 520657921)
train.csv:9000 (us

## 9.建模与预测
实际上在上述特征构造好了之后，可以用很多办法去训练得到模型和完成预测
这里用了sklearn中的SGDClassifier
事实上xgboost有更好的效果

注意交叉验证

In [10]:
# 建模与预测
from __future__ import division

import math

import numpy as np
import pandas as pd

from sklearn.cross_validation import KFold
from sklearn.linear_model import SGDClassifier

def train():
  """
  Fit a classifier on the generated features and return it.

  Reads data_train.csv, takes the columns listed in featureCols as the
  feature matrix and the `interested` column as the target (per the original
  note: 1 = interested, 0 = not interested), then fits an SGDClassifier with
  logistic loss and an L2 penalty.

  NOTE(review): these column names differ from the header written by
  DataRewriter.rewriteData in this file (user_reco, item_reco, svd_reco,
  rating) -- confirm which data_train.csv this expects.
  """
  featureCols = ["invited", "user_reco", "evt_p_reco", "evt_c_reco",
                 "user_pop", "frnd_infl", "evt_pop"]
  frame = pd.read_csv("data_train.csv")
  features = np.matrix(pd.DataFrame(frame, index=None, columns=featureCols))
  labels = np.array(frame.interested)

  model = SGDClassifier(loss="log", penalty="l2")
  model.fit(features, labels)
  return model

def validate():
  """
  Run 10-fold cross-validation on data_train.csv and print the accuracy of
  every fold followed by the average accuracy.

  NOTE(review): KFold(nrows, 10) is the call form of the
  sklearn.cross_validation module imported by this file.
  """
  featureCols = ["invited", "user_reco", "evt_p_reco", "evt_c_reco",
                 "user_pop", "frnd_infl", "evt_pop"]
  frame = pd.read_csv("data_train.csv")
  X = np.matrix(pd.DataFrame(frame, index=None, columns=featureCols))
  y = np.array(frame.interested)

  foldAccuracies = []
  for trainIdx, testIdx in KFold(len(frame), 10):
    model = SGDClassifier(loss="log", penalty="l2")
    model.fit(X[trainIdx], y[trainIdx])

    heldOutX = X[testIdx]
    heldOutY = y[testIdx]
    ntest = len(heldOutY)
    # Score one held-out row at a time and count the exact matches.
    hits = sum(1 for row in range(ntest)
               if model.predict(heldOutX[row, :]) == heldOutY[row])
    accuracy = hits / ntest

    print("accuracy (run %d): %f" % (len(foldAccuracies), accuracy))
    foldAccuracies.append(accuracy)
  print("Average accuracy %s" % (sum(foldAccuracies) / len(foldAccuracies)))

def test(clf):
  """
  Predict every row of data_test.csv with `clf` and write result.csv.

  clf: fitted classifier providing predict() and decision_function().

  The user and event ids are taken from the `user` and `event` columns of
  test.csv and paired with the feature rows of data_test.csv by position.
  Each output row holds user, event, the predicted label ("outcome") and the
  decision-function value ("dist").

  NOTE(review): every column of data_test.csv is used as a feature, while
  train() selects seven named columns -- confirm the two files agree.
  """
  origTestDf = pd.read_csv("test.csv")
  users = origTestDf.user
  events = origTestDf.event

  testDf = pd.read_csv("data_test.csv")
  nrows = len(testDf)
  Xp = np.matrix(testDf)
  yp = np.zeros((nrows, 2))
  # Context manager: the output file is closed even if a prediction raises.
  with open("result.csv", 'wb') as fout:
    fout.write(",".join(["user", "event", "outcome", "dist"]) + "\n")
    for i in range(0, nrows):
      xp = Xp[i, :]
      yp[i, 0] = clf.predict(xp)
      yp[i, 1] = clf.decision_function(xp)
      fout.write(",".join(map(str,
        [users[i], events[i], yp[i, 0], yp[i, 1]])) + "\n")


# Fit the classifier, then write its predictions for the test set.
clf = train()
test(clf)



## 10.生成要提交的文件

In [11]:
# 处理成提交结果的格式
from __future__ import division

import pandas as pd

def byDist(x, y):
  """
  Old-style comparison function that orders records by element [1],
  largest first.

  Returns 1 when y[1] > x[1], -1 when y[1] < x[1] and 0 when they are equal.
  Only the sign is meaningful. The previous int(y[1] - x[1]) truncated
  differences smaller than 1 to 0, so distinct scores compared as equal.
  """
  # int() keeps this working when the elements are numpy scalars, whose
  # comparison results cannot be subtracted directly.
  return int(y[1] > x[1]) - int(y[1] < x[1])

def generate_submition_file():
  """
  Convert result.csv into final_result.csv.

  Rows are grouped by the `user` column. For every user one line is written
  holding the user and the quoted list of that user's events, ordered by
  `dist` from largest to smallest.
  """
  resultDf = pd.read_csv("result.csv")
  # Context manager: the output file is closed even if a group fails.
  with open("final_result.csv", 'wb') as fout:
    fout.write(",".join(["User", "Events"]) + "\n")
    # group remaining user/events
    for name, group in resultDf.groupby("user"):
      user = str(name)
      tuples = zip(list(group.event), list(group.dist), list(group.outcome))
      # Optional filter, currently disabled: keep only rows with outcome == 1.
      # Key-based descending sort: exact for fractional dist values and
      # stable for ties, without relying on a cmp function.
      ranked = sorted(tuples, key=lambda t: t[1], reverse=True)
      events = "\"" + str([t[0] for t in ranked]) + "\""
      fout.write(",".join([user, events]) + "\n")


# Build final_result.csv from result.csv.
generate_submition_file()