In [1]:
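# Concatenate all previously engineered features into the final feature files
# (the original note calls them FE_Train.csv / FE_Test.csv; rewriteData below
# actually writes data_train.csv / data_test.csv)
# in preparation for the final recommender model.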
from __future__ import division

import pickle
import numpy as np
import scipy.io as sio

In [32]:
class DataRewriter:
    """Combine precomputed similarity/popularity matrices into per-row features.

    The constructor loads the index dictionaries and matrices from files in
    the current working directory; ``rewriteData`` then turns each
    ``user,event,invited,...`` row of ``train.csv`` / ``test.csv`` into a
    feature row written to ``data_train.csv`` / ``data_test.csv``.
    """

    # Similarity cut-off used when selecting neighbours for collaborative
    # filtering (inclusive for users, exclusive for events, as in the
    # original implementation).
    SIM_THRESHOLD = 0.8

    def __init__(self):
        """Load every precomputed artefact needed to build the features."""
        # userId -> row index and eventId -> column index, for the users and
        # events that occur in train/test.
        # NOTE: pickle.load can execute arbitrary code; only load trusted files.
        with open("PE_userIndex.pkl", 'rb') as f:
            self.userIndex = pickle.load(f)
        with open("PE_eventIndex.pkl", 'rb') as f:
            self.eventIndex = pickle.load(f)

        # Score of every user for every event in the training data
        # (indexed as [user, event]).
        self.userEventScores = sio.mmread("PE_userEventScores").todense()

        # User-to-user similarity matrix.
        self.userSimMatrix = sio.mmread("US_userSimMatrix").todense()

        # Event-to-event similarity matrices: one built from the non
        # word-count properties, one from the word-count (content) features.
        self.eventPropSim = sio.mmread("EV_eventPropSim").todense()
        self.eventContSim = sio.mmread("EV_eventContSim").todense()

        # Friend count per user; kept exactly as returned by mmread
        # (indexed as [0, user] in userPop).
        self.numFriends = sio.mmread("UF_numFriends")
        # Per-user friend activity matrix (described upstream as the average
        # event score of each user's friends -- not verifiable from here).
        self.userFriends = sio.mmread("UF_userFriends").todense()

        # Event popularity, indexed as [event, 0].
        self.eventPopularity = sio.mmread("EA_eventPopularity").todense()

    @staticmethod
    def _strongNeighbours(simMatrix, index, inclusive=False):
        """Return ``{j: sim}`` for the strong neighbours of row ``index``.

        A neighbour is any column ``j != index`` whose similarity is above
        ``SIM_THRESHOLD`` (``>=`` when ``inclusive`` is true, ``>`` otherwise).
        Keys are produced in ascending column order.
        """
        row = np.asarray(simMatrix[index, :]).ravel()
        threshold = DataRewriter.SIM_THRESHOLD
        mask = row >= threshold if inclusive else row > threshold
        mask[index] = False  # never count the item as its own neighbour
        return {int(j): row[j] for j in np.flatnonzero(mask)}

    def _itemBasedScore(self, i_user, neighbours):
        """Similarity-weighted mean of the user's mean-centred neighbour scores.

        For every neighbouring event ``j`` the user's score for ``j`` is
        centred on the mean score of ``j`` over all users, then weighted by
        the similarity. Returns 0 when there are no neighbours or the
        similarities sum to zero.
        """
        simSum = 0
        scoreSum = 0
        for j, sim in neighbours.items():
            scoresForEvent = self.userEventScores[:, j]
            eventMean = scoresForEvent.sum() / scoresForEvent.size
            scoreSum += sim * (self.userEventScores[i_user, j] - eventMean)
            simSum += sim
        return scoreSum / simSum if simSum != 0 else 0

    def userReco(self, userId, eventId):
        """Score an event for a user with user-based collaborative filtering.

        Every other user whose similarity to ``userId`` is >= SIM_THRESHOLD
        contributes their score for ``eventId``, centred on that user's own
        mean score over all events and weighted by the similarity.

        Returns 0 when either id is unknown, when there are no similar users,
        or when the similarities sum to zero.
        """
        if userId not in self.userIndex or eventId not in self.eventIndex:
            return 0
        i_user = self.userIndex[userId]
        i_event = self.eventIndex[eventId]

        similarUsers = self._strongNeighbours(
            self.userSimMatrix, i_user, inclusive=True)

        simSum = 0
        scoreSum = 0
        for other, sim in similarUsers.items():
            scoresOfOther = self.userEventScores[other, :]
            otherMean = scoresOfOther.sum() / scoresOfOther.size
            scoreSum += sim * (self.userEventScores[other, i_event] - otherMean)
            simSum += sim

        return scoreSum / simSum if simSum != 0 else 0

    def eventReco(self, userId, eventId):
        """Score an event for a user with item-based collaborative filtering.

        Returns a pair ``(pscore, cscore)``: the first uses neighbours taken
        from the property-based event similarity, the second uses neighbours
        taken from the content-based event similarity. Only events with
        similarity > SIM_THRESHOLD count as neighbours. Either score is 0
        when an id is unknown or no neighbours exist.
        """
        if userId not in self.userIndex or eventId not in self.eventIndex:
            return 0, 0
        i_user = self.userIndex[userId]
        i_event = self.eventIndex[eventId]

        propNeighbours = self._strongNeighbours(self.eventPropSim, i_event)
        # Fix: the content score must be driven by the content-similarity
        # neighbours; it previously re-used the property neighbours.
        contNeighbours = self._strongNeighbours(self.eventContSim, i_event)

        pscore = self._itemBasedScore(i_user, propNeighbours)
        cscore = self._itemBasedScore(i_user, contNeighbours)
        return pscore, cscore

    def ModelReco(self, userId, eventId):
        """Placeholder for model-based collaborative filtering (SVD++/LFM).

        Not implemented; always returns 0.
        """
        return 0

    def userPop(self, userId):
        """Return the user's friend count as a proxy for how social they are.

        The idea is that users with many friends may be more inclined to
        attend social events. Returns 0 for an unknown user or when the
        user's index is outside ``numFriends``.
        """
        if userId not in self.userIndex:
            return 0
        try:
            return self.numFriends[0, self.userIndex[userId]]
        except IndexError:
            return 0

    def friendInfluence(self, userId):
        """Return the influence of the user's friends on the user.

        Computed as the sum of the user's row of ``userFriends`` divided by
        the number of columns of that matrix.

        Raises:
            KeyError: if ``userId`` is not in ``userIndex``.
        """
        nusers = np.shape(self.userFriends)[1]
        i = self.userIndex[userId]
        # Fix: sum the whole row. The previous ``.sum(axis=0)`` on a 1 x n
        # matrix row left the row unchanged, so only column 0 was used.
        return self.userFriends[i, :].sum() / nusers

    def eventPop(self, eventId):
        """Return the popularity of the event from ``eventPopularity``.

        Raises:
            KeyError: if ``eventId`` is not in ``eventIndex``.
        """
        i = self.eventIndex[eventId]
        return self.eventPopularity[i, 0]

    def rewriteData(self, start=1, train=True, header=True):
        """Write the combined feature file used to train/apply a classifier.

        Reads ``train.csv`` (or ``test.csv`` when ``train`` is false) and
        writes ``data_train.csv`` / ``data_test.csv`` with one feature row
        per input row.

        Args:
            start: 1-based number of the first input line to process; pass 2
                to skip a header line in the input.
            train: select the training file and append the label columns
                (input columns 4 and 5) to every output row.
            header: write a header line with the output column names.
        """
        fn = "train.csv" if train else "test.csv"
        # Context managers guarantee both files are closed even if a row
        # raises part-way through.
        with open(fn, 'rb') as fin, open("data_" + fn, 'wb') as fout:
            if header:
                ocolnames = ["invited", "user_reco", "evt_p_reco",
                             "evt_c_reco", "user_pop", "frnd_infl", "evt_pop"]
                if train:
                    ocolnames.append("interested")
                    ocolnames.append("not_interested")
                fout.write((",".join(ocolnames) + "\n").encode())

            for ln, line in enumerate(fin, 1):
                if ln < start:
                    continue
                cols = line.strip().decode().split(",")
                userId = cols[0]
                eventId = cols[1]
                invited = cols[2]
                if ln % 500 == 0:
                    # Progress report every 500 input lines.
                    print("%s:%d (userId, eventId)=(%s, %s)" % (fn, ln, userId, eventId))
                user_reco = self.userReco(userId, eventId)
                evt_p_reco, evt_c_reco = self.eventReco(userId, eventId)
                user_pop = self.userPop(userId)
                frnd_infl = self.friendInfluence(userId)
                evt_pop = self.eventPop(eventId)
                ocols = [invited, user_reco, evt_p_reco,
                         evt_c_reco, user_pop, frnd_infl, evt_pop]
                if train:
                    ocols.append(cols[4])  # interested
                    ocols.append(cols[5])  # not_interested
                fout.write((",".join(str(x) for x in ocols) + "\n").encode())

    def rewriteTrainingSet(self):
        """Rewrite ``train.csv`` using the default ``start`` and ``header``.

        The default ``start=1`` processes the first input line as data; call
        ``rewriteData(start=2, ...)`` directly if the input has a header line.
        """
        # Fix: pass the flag by keyword; positionally it was bound to `start`.
        self.rewriteData(train=True)

    def rewriteTestSet(self):
        """Rewrite ``test.csv`` using the default ``start`` and ``header``.

        The default ``start=1`` processes the first input line as data; call
        ``rewriteData(start=2, ...)`` directly if the input has a header line.
        """
        # Fix: pass the flag by keyword; positionally it was bound to `start`,
        # so the training file was rewritten instead of the test file.
        self.rewriteData(train=False)



In [33]:
# Build the feature files: first for the training set, then for the test set.
# start=2 skips the first line of each input CSV; header=True writes the
# output column names.
dr = DataRewriter()
for banner, is_train in (("生成训练数据...\n", True), ("生成预测数据...\n", False)):
    print(banner)
    dr.rewriteData(train=is_train, start=2, header=True)

生成训练数据...

train.csv:500 (userId, eventId)=(123290209, 1887085024)
train.csv:1000 (userId, eventId)=(272886293, 199858305)
train.csv:1500 (userId, eventId)=(395305791, 1582270949)
train.csv:2000 (userId, eventId)=(527523423, 3272728211)
train.csv:2500 (userId, eventId)=(651258472, 792632006)
train.csv:3000 (userId, eventId)=(811791433, 524756826)
train.csv:3500 (userId, eventId)=(985547042, 1269035551)
train.csv:4000 (userId, eventId)=(1107615001, 173949238)
train.csv:4500 (userId, eventId)=(1236336671, 3849306291)
train.csv:5000 (userId, eventId)=(1414301782, 2652356640)
train.csv:5500 (userId, eventId)=(1595465532, 955398943)
train.csv:6000 (userId, eventId)=(1747091728, 2131379889)
train.csv:6500 (userId, eventId)=(1914182220, 955398943)
train.csv:7000 (userId, eventId)=(2071842684, 1076364848)
train.csv:7500 (userId, eventId)=(2217853337, 3051438735)
train.csv:8000 (userId, eventId)=(2338481531, 2525447278)
train.csv:8500 (userId, eventId)=(2489551967, 520657921)
train.csv:9000 (us