#### 导入库

In [1]:
import pandas as pd
import numpy as np
import os
from scipy.stats import pearsonr
from scipy.sparse import diags
import matplotlib.pyplot as plt
from tqdm import tqdm
# Use a sans-serif family for plots, listing SimHei ahead of Arial.
plt.rcParams['font.family'] = 'sans-serif'
plt.rcParams['font.sans-serif'] = ['SimHei', 'Arial']

# Matrix dimensions shared by the cells below:
#   col - number of data rows in trans_program.txt (used as the program count)
#   row - number of entries in the user directory (used as the user count)
col = len(pd.read_csv(r'trans_program.txt', sep='\t'))
site = '1.1用户编码/'
row = len(os.listdir(site))

#### 1)rca矩阵

In [None]:
def rca_matrix(threshhold):
    """Build binary user-by-program matrices from RCA scores.

    For every file in the user directory the rows are split by the 'date'
    column (1 = training set, 2 = test set).  Within each split the RCA of a
    program is the program's share of the user's total '时间差' divided by
    the program's '比例' taken from the matching ratio table.  Programs whose
    RCA is at least ``threshhold`` are marked with 1.

    Args:
        threshhold: Minimum RCA for a program to be marked.

    Returns:
        ``(weight3, weight4)``: two ``(row, col)`` float arrays (rows are
        users, columns are programs) for the training and test split.
        ``row`` and ``col`` are read from module scope.
    """
    weight3 = np.zeros((row, col))  # rows: users, columns: programs
    weight4 = np.zeros((row, col))

    df1 = pd.read_csv(r'2.1节目\节目比例1.txt', sep='\t')
    df2 = pd.read_csv(r'2.1节目\节目比例2.txt', sep='\t')
    site = '1.1用户编码/'
    for file in os.listdir(site):
        data = pd.read_csv(site + file, sep='\t')
        # Both selections are recomputed from scratch for every file, so a
        # file lacking one split can never reuse the previous file's values.
        user1, w1 = _rca_selection(data.loc[data['date'] == 1], df1, threshhold)
        user2, w2 = _rca_selection(data.loc[data['date'] == 2], df2, threshhold)
        del data

        # Prefer the index found in the training rows; fall back to the test
        # rows.  NOTE(review): assumes each file holds a single user, so both
        # splits carry the same value in their first column - confirm.
        i = user1 if user1 is not None else user2
        if i is None:
            continue  # nothing usable in this file

        weight3[i, w1] = 1  # i and w1 must be integers
        weight4[i, w2] = 1
    return weight3, weight4


def _rca_selection(records, ratios, threshhold):
    """Return ``(user_index, program_indices)`` for one date split of a file.

    ``user_index`` is the integer in the first column of the first merged
    row, or ``None`` when the split has no rows left after merging with
    ``ratios``; in that case ``program_indices`` is an empty integer array.
    """
    nothing = np.array([], dtype=int)
    if len(records) == 0:
        return None, nothing

    records = records.copy()
    records['时间差'] = records['时间差'] / records['时间差'].sum()
    records.rename(columns={'时间差': '时长占用户的比例'}, inplace=True)
    records = pd.merge(records, ratios[['节目p', '比例']], on='节目p')
    if len(records) == 0:
        # The inner merge dropped every row; there is no first row to read.
        return None, nothing

    records['RCA'] = records['时长占用户的比例'] / records['比例']
    user_index = int(records.iloc[0, 0])
    selected = records.loc[records['RCA'] >= threshhold, '节目p']
    return user_index, selected.to_numpy().astype(int)
# Build the training (weight3) and test (weight4) matrices with an RCA threshold of 1.
weight3,weight4 = rca_matrix(1)

#### 2)B 频道矩阵

In [2]:
# Load the program table and use each row's index as its program id.
data = pd.read_csv(r'trans_program.txt', sep='\t')
data['节目p'] = data.index
data = data[['频道编码', '节目p']].sort_values(by='频道编码')

# B: rows are channels, columns are programs (B is L*M, the user matrix is N*M).
channel_count = len(data['频道编码'].unique())
program_count = len(data['节目p'].unique())
B = np.zeros((channel_count, program_count))

# Mark every program that belongs to a channel.
# NOTE(review): the channel code is used directly as the row index, which
# presumably requires codes in 0..channel_count-1 - confirm.
for i, group in data.groupby('频道编码'):
    w = group['节目p'].to_numpy().astype(int)
    B[i, w] = 1

#### 物质扩散NBI

In [None]:
# Mass diffusion: degree-normalised matrices used to compute the score matrix.
def mass_diffusion_origin(B):
    """Return the two degree-normalised matrices used by mass diffusion.

    Args:
        B: 2-D array of shape ``(N, M)``.

    Returns:
        ``(B1, B2)`` where

        * ``B1`` has shape ``(M, N)`` and equals ``B.T`` with column ``j``
          divided by the sum of row ``j`` of ``B``;
        * ``B2`` has shape ``(N, M)`` and equals ``B`` with column ``j``
          divided by the sum of column ``j`` of ``B``.

        Columns whose divisor is zero are set to zero instead of dividing.
    """
    N, M = B.shape
    kN = np.sum(B, axis=1)  # row sums
    kM = np.sum(B, axis=0)  # column sums

    inv_kN = np.zeros(N)
    has_row = kN != 0
    inv_kN[has_row] = 1.0 / kN[has_row]

    inv_kM = np.zeros(M)
    has_col = kM != 0
    inv_kM[has_col] = 1.0 / kM[has_col]

    # Broadcasting a 1-D vector over the last axis scales each column, which
    # is what right-multiplying by a diagonal matrix does, but without
    # allocating dense N*N and M*M diagonal matrices.
    B1 = B.T * inv_kN
    B2 = B * inv_kM
    return B1, B2

# Mass-diffusion scores: S1 diffuses through the user-program matrix,
# S2 through the channel-program matrix.  Intermediates are dropped right
# after use to release memory.
A1, A2 = mass_diffusion_origin(weight3)
S1 = weight3 @ A2.T @ A1.T
del A1, A2

B1, B2 = mass_diffusion_origin(B)
S2 = weight3 @ B2.T @ B1.T
del B1, B2

#### 协同过滤CF

In [None]:
# Collaborative filtering: column-to-column similarity used for the score matrix.
def collaborate_filter(B):
    """Return the Jaccard similarity between every pair of columns of ``B``.

    Args:
        B: 2-D array of shape ``(N, M)``; intended to hold 0/1 entries.

    Returns:
        Float array ``S`` of shape ``(M, M)`` with
        ``S[j, k] = intersection / union`` for columns ``j`` and ``k``.
        Pairs whose union is zero (both columns empty) get a similarity of 0
        rather than NaN, so they cannot poison later matrix products.
    """
    intersection = (B.T).dot(B)
    # Inclusion-exclusion: |j or k| = |j| + |k| - |j and k|.  This equals
    # N - (1 - B.T).dot(1 - B) but needs no second large matrix product.
    col_totals = np.sum(B, axis=0)
    union = col_totals[:, None] + col_totals[None, :] - intersection

    S = np.zeros(intersection.shape, dtype=float)
    np.divide(intersection, union, out=S, where=union != 0)
    return S

# Collaborative-filtering scores: project the user-program matrix through the
# program similarity derived from users (S1) and from channels (S2).
S1 = weight3.dot(collaborate_filter(weight3))
S2 = weight3.dot(collaborate_filter(B))

#### 计算rs

In [4]:
def ranking_score_time_mass(weight3,weight4,B,alpha):
    """Return the mean ranking score of the combined score ``S1 + alpha*S2``.

    For each user, only programs with a 0 in ``weight3`` are considered.
    They are ordered by descending score, each gets the relative position
    ``rank / count`` (rank starting at 1), and the user's ranking score is
    the mean relative position of the programs that are non-zero in
    ``weight4``.  Users with no such program are left out.

    Args:
        weight3: User-by-program matrix; zero entries are the candidates.
        weight4: User-by-program matrix of the items to be recovered.
        B: Unused; kept so existing callers keep working.
        alpha: Weight applied to ``S2``.

    Returns:
        Mean of the per-user ranking scores, or NaN when no user qualifies.

    Note:
        ``S1`` and ``S2`` are read from module scope.
    """
    S = S1 + alpha*S2

    per_user = []  # a list avoids the quadratic cost of repeated np.append
    for scores, train_row, test_row in zip(S, weight3, weight4):
        unseen = train_row == 0
        a = scores[unseen]    # scores of the candidate programs
        b = test_row[unseen]  # whether each candidate is in the test set
        if np.all(b == 0):
            continue          # nothing to recover for this user

        order = np.argsort(a)[::-1]                      # descending score
        relative_rank = (np.arange(len(a)) + 1) / len(a)
        per_user.append(np.mean(relative_rank[b[order] != 0]))

    if not per_user:
        return float('nan')
    return np.mean(per_user)

#### 不同α

In [5]:
def alpha(num):
    """Evaluate the ranking score for one weight and append it to '结果.txt'.

    Args:
        num: Weight passed to ``ranking_score_time_mass`` as ``alpha``.

    One tab-separated row ``(alpha, rs)`` is appended per call; the header
    line is written only when the file is still empty.  ``weight3``,
    ``weight4`` and ``B`` are read from module scope.
    """
    rs = ranking_score_time_mass(weight3,weight4,B,num)
    data = pd.DataFrame([[num, rs]],columns=['alpha', 'rs'])
    # The with-statement closes the file, so no explicit close() is needed.
    with open('结果.txt', 'a',encoding='utf-8',newline='') as file:
        data.to_csv(file, header=file.tell()==0, sep='\t', index=False)
# Sweep the weight from 2 down to 0 in steps of 0.2, one result row per value.
for num in (2, 1.8, 1.6, 1.4, 1.2, 1, 0.8, 0.6, 0.4, 0.2, 0):
    alpha(num)