In [1]:
import pandas as pd
import numpy as np
import math
import re
from scipy.sparse import csr_matrix
import matplotlib.pyplot as plt
import seaborn as sns
from surprise import Reader, Dataset, SVD, evaluate
from sklearn.model_selection import train_test_split

In [2]:
pdf = pd.read_csv('/Users/benko/Downloads/customer_spu.csv')
pdf.head()

Unnamed: 0,customer_id,spu_id,try_buy_something,already_buy_something,viewed,collection,reviewd
0,30134,307365,1,2,0,0,0
1,30134,307367,1,2,0,0,0
2,30134,307368,1,1,0,0,0
3,30134,350109,1,1,0,0,0
4,30216,362185,1,1,0,0,0


In [3]:
pdf[['try_buy_something','already_buy_something', 'viewed', 'collection', 'reviewd']].describe()

Unnamed: 0,try_buy_something,already_buy_something,viewed,collection,reviewd
count,3851009.0,3851009.0,3851009.0,3851009.0,3851009.0
mean,0.005702402,0.9032448,0.8850995,0.01923236,0.03120299
std,0.08575314,1.648287,29.50185,0.1401574,0.218331
min,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0,0.0
50%,0.0,1.0,1.0,0.0,0.0
75%,0.0,1.0,1.0,0.0,0.0
max,15.0,338.0,56228.0,9.0,60.0


In [4]:
len(pdf)

3851009

In [5]:
pdf['rating'] = 0.2 * pdf['try_buy_something'] + 0.3 * pdf['already_buy_something'] + 0.1 * pdf['viewed'] + 0.2 * pdf['collection'] + pdf['reviewd']
pdf.head()

Unnamed: 0,customer_id,spu_id,try_buy_something,already_buy_something,viewed,collection,reviewd,rating
0,30134,307365,1,2,0,0,0,0.8
1,30134,307367,1,2,0,0,0,0.8
2,30134,307368,1,1,0,0,0,0.5
3,30134,350109,1,1,0,0,0,0.5
4,30216,362185,1,1,0,0,0,0.5


In [6]:
pdf = pdf[~pdf.customer_id.isin([288942, 318769, 544120, 520906, 544120, 369949, 48632, 369556, 476317, 657795, 280205, 46117])]
pdf['rating'].describe()

count    3.849583e+06
mean     3.937681e-01
std      6.405139e-01
min      1.000000e-01
25%      1.000000e-01
50%      3.000000e-01
75%      4.000000e-01
max      1.295000e+02
Name: rating, dtype: float64

In [7]:
df = pdf[pdf['viewed'] != 0]  #去掉view为0的
df[df['rating'] < 50].rating.describe()  #去掉rating太大的
df.rating.describe()

count    2.069168e+06
mean     3.594249e-01
std      7.738909e-01
min      1.000000e-01
25%      1.000000e-01
50%      1.000000e-01
75%      4.000000e-01
max      1.295000e+02
Name: rating, dtype: float64

In [8]:
df.sort_values('rating', ascending=False).head()

Unnamed: 0,customer_id,spu_id,try_buy_something,already_buy_something,viewed,collection,reviewd,rating
3792798,711659,370549,0,0,1295,0,0,129.5
3792776,711659,366124,0,0,1238,0,0,123.8
2549214,122841,313646,0,0,1139,0,0,113.9
1354471,372170,355506,0,338,8,1,0,102.4
2202294,650572,358861,0,1,918,0,0,92.1


In [9]:
df.sort_values('rating', ascending=True).head()

Unnamed: 0,customer_id,spu_id,try_buy_something,already_buy_something,viewed,collection,reviewd,rating
2778113,269587,303919,0,0,1,0,0,0.1
2620282,170784,358317,0,0,1,0,0,0.1
2620281,170784,357880,0,0,1,0,0,0.1
2620280,170784,357879,0,0,1,0,0,0.1
2620279,170784,356975,0,0,1,0,0,0.1


In [10]:
df[['customer_id', 'spu_id', 'rating']][:1000000].head()

Unnamed: 0,customer_id,spu_id,rating
6,30345,306248,0.3
10,30537,346721,0.6
11,30537,350109,0.3
12,30773,307253,2.3
16,30773,309613,1.4


In [11]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2069168 entries, 6 to 3813856
Data columns (total 8 columns):
customer_id              int64
spu_id                   int64
try_buy_something        int64
already_buy_something    int64
viewed                   int64
collection               int64
reviewd                  int64
rating                   float64
dtypes: float64(1), int64(7)
memory usage: 142.1 MB


In [12]:
reader = Reader()
trainData = df[['customer_id', 'spu_id', 'rating']][1000000:]

In [13]:
testset = df[['customer_id', 'spu_id', 'rating']][:1000000]
df.groupby(['customer_id']).size().sort_values(ascending=False).head(10)

customer_id
91426     12990
315347     4136
578245     2731
96505      2663
332374     2311
42669      2221
415749     2179
391159     1472
619147     1457
316503     1412
dtype: int64

In [14]:
train_df_147175 = trainData[(trainData['customer_id'] == 147175) & (trainData['rating'] >= 0.8)]
train_df_147175 = train_df_147175.set_index('spu_id')
train_df_147175.sort_values('rating', ascending=False).head()

Unnamed: 0_level_0,customer_id,rating
spu_id,Unnamed: 1_level_1,Unnamed: 2_level_1


In [15]:
test_df_147175 = testset[(testset['customer_id'] == 147175) & (testset['rating'] >= 0.8)]
test_df_147175 = test_df_147175.set_index('spu_id')
test_df_147175.sort_values('rating', ascending=False).head()

Unnamed: 0_level_0,customer_id,rating
spu_id,Unnamed: 1_level_1,Unnamed: 2_level_1
307275,147175,19.9
304141,147175,19.0
303872,147175,17.4
304149,147175,16.3
303784,147175,16.0


In [16]:
tmp = df[:1000000]
tmp[(tmp['customer_id'] == 147175) & (tmp['rating'] >= 0.8)].sort_values('rating', ascending=False).head()

Unnamed: 0,customer_id,spu_id,try_buy_something,already_buy_something,viewed,collection,reviewd,rating
605065,147175,307275,0,66,1,0,0,19.9
604878,147175,304141,0,63,1,0,0,19.0
604823,147175,303872,0,57,3,0,0,17.4
604881,147175,304149,0,54,1,0,0,16.3
604788,147175,303784,0,53,1,0,0,16.0


测试集和训练集的数据差距很大

In [17]:
test_df_147175 = df['spu_id'].unique()
test_df_147175[:10]

array([306248, 346721, 350109, 307253, 309613, 317635, 333117, 350088,
       352950, 357084])

In [18]:
test_df_147175 = pd.DataFrame(test_df_147175, columns=['spu_id'])
test_df_147175.head()

Unnamed: 0,spu_id
0,306248
1,346721
2,350109
3,307253
4,309613


In [19]:
data = Dataset.load_from_df(trainData, reader)
data.split(n_folds=3)

svd = SVD()
evaluate(svd, data, measures=['RMSE', 'MAE'])
trainset = data.build_full_trainset()
svd.fit(trainset)



Evaluating RMSE, MAE of algorithm SVD.

------------
Fold 1
RMSE: 0.8977
MAE:  0.8584
------------
Fold 2
RMSE: 0.9147
MAE:  0.8584
------------
Fold 3
RMSE: 0.9209
MAE:  0.8583
------------
------------
Mean RMSE: 0.9111
Mean MAE : 0.8584
------------
------------


<surprise.prediction_algorithms.matrix_factorization.SVD at 0x10ef6f0f0>

In [20]:
test_df_147175['Estimate_Score'] = test_df_147175['spu_id'].apply(lambda x: svd.predict(147175, x).est)
test_df_147175.sort_values('Estimate_Score', ascending=False).head(20)


Unnamed: 0,spu_id,Estimate_Score
16357,366847,2.583995
198,366124,2.421381
13175,370549,2.307954
13434,313410,1.776875
13329,373093,1.620139
11121,362340,1.493905
18316,360981,1.47656
12041,368924,1.021503
14494,359687,1.0
14493,306571,1.0


In [21]:
test_df_147175.sort_values('Estimate_Score', ascending=False).spu_id.head(20).values.tolist()

[366847,
 366124,
 370549,
 313410,
 373093,
 362340,
 360981,
 368924,
 359687,
 306571,
 349960,
 365861,
 365859,
 370383,
 351787,
 347056,
 312474,
 333780,
 345731,
 309506]

In [22]:
test_df_147175[test_df_147175['spu_id'] == 307275]

Unnamed: 0,spu_id,Estimate_Score
512,307275,1.0


In [23]:
len(df[['customer_id', 'spu_id', 'rating']][1000000:])

1069168

通常用于推荐引擎评价的指标是称为最小均方根误差（Root Mean Squared Error，RMSE）的指标，它首先计算均方误差的平均值然后取其平方根。如果评级在1星到5星这个范围内，而我们得到的RMSE为1.0，那么就意味着我们的预测值和用户给出的真实评价相差了一个星级。

重新对输入数据进行处理
***去掉特殊用户数据
***去掉爆款产品的数据
***在有销量的产品中汇总view 收藏数据

加入评价