In [1]:
import pandas as pd
import numpy as np
import math
import re
from scipy.sparse import csr_matrix
import matplotlib.pyplot as plt
import seaborn as sns
from surprise import Reader, Dataset, SVD, evaluate
from sklearn.model_selection import train_test_split

In [2]:
df = pd.read_csv('/Users/zbin/Downloads/tmp1.csv')
df.head()

Unnamed: 0,customer_id,spu_id,try_buy_something,already_buy_something,viewed,collection,reviewd
0,29999,301821,2,2,1,0,0
1,29999,362469,1,1,0,0,0
2,30122,307288,1,1,0,0,0
3,30134,307365,1,2,0,0,0
4,30134,307367,1,2,0,0,0


In [3]:
len(df)

4418613

In [5]:
df['rating'] = 0.2 * df['try_buy_something'] + 0.3 * df['already_buy_something'] + 0.1 * df['viewed'] + 0.2 * df['collection'] + df['reviewd']
df.head()

Unnamed: 0,customer_id,spu_id,try_buy_something,already_buy_something,viewed,collection,reviewd,rating
0,29999,301821,2,2,1,0,0,1.1
1,29999,362469,1,1,0,0,0,0.5
2,30122,307288,1,1,0,0,0,0.5
3,30134,307365,1,2,0,0,0,0.8
4,30134,307367,1,2,0,0,0,0.8


In [6]:
df.sort_values('rating', ascending=False).head()

Unnamed: 0,customer_id,spu_id,try_buy_something,already_buy_something,viewed,collection,reviewd,rating
1406918,288942,358835,0,9566,0,0,0,2869.8
1407007,288942,359065,0,8512,0,0,0,2553.6
1407273,288942,360680,0,6240,0,0,0,1872.0
1404237,288942,337812,0,6218,0,0,0,1865.4
1405469,288942,349914,0,5692,0,0,0,1707.6


In [7]:
df = df[~df.customer_id.isin([288942, 318769, 544120, 520906, 544120, 369949, 48632, 369556, 476317, 657795, 280205])]
df['rating'].describe()

count    4.379072e+06
mean     3.506792e-01
std      5.198375e-01
min      1.000000e-01
25%      1.000000e-01
50%      3.000000e-01
75%      3.000000e-01
max      9.870000e+01
Name: rating, dtype: float64

In [8]:
df.sort_values('rating', ascending=False).head()

Unnamed: 0,customer_id,spu_id,try_buy_something,already_buy_something,viewed,collection,reviewd,rating
1722100,372170,355506,0,328,1,1,0,98.7
1299043,265617,312763,0,282,1,0,0,84.7
1629706,344967,305645,0,86,1,0,58,83.9
236641,57763,307414,0,240,1,0,1,73.1
236395,57763,303996,0,240,1,0,0,72.1


In [9]:
df[['customer_id', 'spu_id', 'rating']][:100000].head()

Unnamed: 0,customer_id,spu_id,rating
0,29999,301821,1.1
1,29999,362469,0.5
2,30122,307288,0.5
3,30134,307365,0.8
4,30134,307367,0.8


In [10]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4379072 entries, 0 to 4418612
Data columns (total 8 columns):
customer_id              int64
spu_id                   int64
try_buy_something        int64
already_buy_something    int64
viewed                   int64
collection               int64
reviewd                  int64
rating                   float64
dtypes: float64(1), int64(7)
memory usage: 300.7 MB


In [11]:
reader = Reader()

# get just top 100K rows for faster run time
data = Dataset.load_from_df(df[['customer_id', 'spu_id', 'rating']][:100000], reader)
data.split(n_folds=3)

svd = SVD()
evaluate(svd, data, measures=['RMSE', 'MAE'])



Evaluating RMSE, MAE of algorithm SVD.

------------
Fold 1
RMSE: 0.8925
MAE:  0.6434
------------
Fold 2
RMSE: 0.8222
MAE:  0.6389
------------
Fold 3
RMSE: 0.8954
MAE:  0.6432
------------
------------
Mean RMSE: 0.8701
Mean MAE : 0.6418
------------
------------


CaseInsensitiveDefaultDict(list,
                           {'rmse': [0.8925079685433555,
                             0.8222362912882945,
                             0.8954161392807537],
                            'mae': [0.6433636980339961,
                             0.6388706230446449,
                             0.6431858000405348]})

In [25]:
data = Dataset.load_from_df(df[['customer_id', 'spu_id', 'rating']], reader)
trainset = data.build_full_trainset()

In [None]:
svd.fit(trainset)

In [12]:
df_30134 = df[(df['customer_id'] == 30134) & (df['rating'] >= 0.8)]
df_30134 = df_30134.set_index('spu_id')
df_30134.head()

Unnamed: 0_level_0,customer_id,try_buy_something,already_buy_something,viewed,collection,reviewd,rating
spu_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
307365,30134,1,2,0,0,0,0.8
307367,30134,1,2,0,0,0,0.8
307377,30134,0,3,0,0,0,0.9


In [14]:
df_30134 = df['spu_id'].unique()
df_30134[:10]

array([301821, 362469, 307288, 307365, 307367, 307368, 350109, 362185,
       367245, 306248])

In [15]:
df_30134 = pd.DataFrame(df_30134, columns=['spu_id'])
df_30134.head()

Unnamed: 0,spu_id
0,301821
1,362469
2,307288
3,307365
4,307367


In [28]:
df_30134['Estimate_Score'] = df_30134['spu_id'].apply(lambda x: svd.predict(30134, x).est)
df_30134 = df_30134.sort_values('Estimate_Score', ascending=False)
df_30134.head(20)

Unnamed: 0,spu_id,Estimate_Score
29969,352124,3.295737
15101,348680,1.51949
19146,324292,1.460266
1634,351907,1.326389
5290,100030,1.295925
