In [1]:
import pandas as pd
import numpy as np

from scipy.sparse import csr_matrix
%matplotlib inline
import matplotlib.pyplot as plt

In [2]:
# Path of the reviews dump that get_data_frame() reads through iter_json_data()
# (one JSON document per line).
JSON_DATA_PATH = "data/reviews.json"
# NOTE(review): N is not referenced by any cell visible in this notebook —
# presumably the intended length of a recommendation list; confirm or remove.
N = 10

In [3]:
import json

def iter_json_data(path):
    """Yield one decoded JSON document per non-blank line of a JSON-lines file.

    :param path: path to a text file holding one JSON document per line
    :return: generator of decoded objects, in file order
    :raises json.JSONDecodeError: if a non-blank line is not valid JSON
    """
    # JSON is UTF-8 by specification; do not rely on the platform default encoding.
    with open(path, encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if not line:
                # Tolerate blank lines (for example an empty line at the end of the dump).
                continue
            yield json.loads(line)
            
def get_data_frame(path=None):
    """Load the review dump into a DataFrame with consecutive integer user and item ids.

    :param path: JSON-lines file to read; when None the notebook-level
        JSON_DATA_PATH is used, which keeps the original zero-argument call working
    :return: pd.DataFrame with the columns uid, iid, review, rating, dt, helpful, summary
    """
    if path is None:
        path = JSON_DATA_PATH

    # reviewerID / asin values are replaced by consecutive integers,
    # numbered in order of first appearance in the file.
    uid_to_id = {}
    iid_to_id = {}

    cols = ["uid", "iid", "review", "rating", "dt", "helpful", "summary"]
    rows = []
    for d in iter_json_data(path):
        uid = uid_to_id.setdefault(d["reviewerID"], len(uid_to_id))
        iid = iid_to_id.setdefault(d["asin"], len(iid_to_id))
        review = d["reviewText"]
        rating = float(d["overall"])
        dt = int(d["unixReviewTime"])
        # "helpful" is a pair whose first element is divided by the second;
        # a zero denominator maps to 0.0 so the column is always float.
        votes_up, votes_total = d["helpful"][0], d["helpful"][1]
        helpful = votes_up / votes_total if votes_total else 0.0
        summary = d["summary"]
        rows.append((uid, iid, review, rating, dt, helpful, summary))

    return pd.DataFrame(rows, columns=cols)

In [4]:
df = get_data_frame()
df.head()

Unnamed: 0,uid,iid,review,rating,dt,helpful,summary
0,0,0,Installing the game was a struggle (because of...,1.0,1341792000,0.666667,Pay to unlock content? I don't think so.
1,1,0,If you like rally cars get this game you will ...,4.0,1372550400,0.0,Good rally game
2,2,0,1st shipment received a book instead of the ga...,1.0,1403913600,0.0,Wrong key
3,3,0,"I got this version instead of the PS3 version,...",3.0,1315958400,0.7,"awesome game, if it did not crash frequently !!"
4,4,0,I had Dirt 2 on Xbox 360 and it was an okay ga...,4.0,1308009600,1.0,DIRT 3


In [5]:
def split_df_by_dt(df, p=0.8):
    """Split df into a training and a test part by review publication time (column dt).

    :param df: frame with at least the columns dt, uid and iid
    :param p: quantile of the dt values that bounds the training part. For example,
        p=0.8 puts every review whose dt is not later than the 0.8-quantile of dt
        into the training part and the remaining reviews into the test part
    :return: two pd.DataFrame objects (training, test)
    """
    timestamps = df["dt"]
    cutoff = timestamps.quantile(p)
    print("Min=%s, border=%s, max=%s" % (timestamps.min(), cutoff, timestamps.max()))

    training_df = df[timestamps <= cutoff]
    test_df = df[timestamps > cutoff]
    print("Размер до очистки:", training_df.shape, test_df.shape)

    # Drop test rows that refer to users or items absent from the training data
    # (users: avoids problems for personalised systems; items: needed by every system).
    known_user = test_df["uid"].isin(training_df["uid"])
    known_item = test_df["iid"].isin(training_df["iid"])
    test_df = test_df[known_user & known_item]
    print("Размер после очистки:", training_df.shape, test_df.shape)

    return training_df, test_df

In [6]:
def hit_ratio(recs_dict, test_dict):
    """Compute the hit-ratio metric for two dictionaries.

    A user counts as a hit when at least one of the user's test items is among
    the items recommended to that user.

    :param recs_dict: recommendations, shaped like {uid: {iid: score, ...}, ...}
    :param test_dict: held-out test data, shaped like {uid: {iid: score, ...}, ...}
    :return: share of the users in test_dict that have at least one hit;
        0.0 when test_dict is empty
    """
    if not test_dict:
        # No test users: report 0.0 instead of dividing by zero.
        return 0.0

    hits = sum(
        1
        for uid, test_items in test_dict.items()
        # Users without any recommendations simply never produce a hit.
        if not set(test_items).isdisjoint(recs_dict.get(uid, {}))
    )
    return hits / len(test_dict)

In [7]:
training_df, test_df = split_df_by_dt(df)
del df

Min=939859200, border=1377129600.0, max=1405987200
Размер до очистки: (185427, 7) (46353, 7)
Размер после очистки: (185427, 7) (19174, 7)


In [8]:
training_df.head()

Unnamed: 0,uid,iid,review,rating,dt,helpful,summary
0,0,0,Installing the game was a struggle (because of...,1.0,1341792000,0.666667,Pay to unlock content? I don't think so.
1,1,0,If you like rally cars get this game you will ...,4.0,1372550400,0.0,Good rally game
3,3,0,"I got this version instead of the PS3 version,...",3.0,1315958400,0.7,"awesome game, if it did not crash frequently !!"
4,4,0,I had Dirt 2 on Xbox 360 and it was an okay ga...,4.0,1308009600,1.0,DIRT 3
5,5,0,"Overall this is a well done racing game, with ...",4.0,1368230400,0.0,"Good racing game, terrible Windows Live Requir..."


In [None]:
from scipy.sparse import csr_matrix
def get_user_item_matrix(df):
    """Build a sparse user x item rating matrix from a review frame.

    Rows follow the sorted order of the distinct uid values; columns follow the
    order in which the distinct iid values first appear in df. Ratings of a
    repeated (uid, iid) pair are summed by the sparse constructor.

    :param df: frame with the columns uid, iid and rating
    :return: scipy.sparse.csr_matrix of shape (number of users, number of items)
    """
    # factorize returns, for every review, the position of its uid / iid
    # among the distinct values, which is exactly the matrix coordinate needed.
    row_ids, users = pd.factorize(df["uid"], sort=True)
    col_ids, items = pd.factorize(df["iid"])
    ratings = df["rating"].values.astype(float)

    csr = csr_matrix((ratings, (row_ids, col_ids)), shape=(len(users), len(items)))
    return csr
    
   

In [47]:
# Collect, for every user, the list of reviewed item ids and the list of ratings.
# Both frames come from the same grouping of training_df by uid.
reviews_by_user = training_df.groupby('uid')
iid_group_by_user_df = reviews_by_user['iid'].apply(list).reset_index()
rating_group_by_user_df = reviews_by_user['rating'].apply(list).reset_index()

In [152]:
group_by_user_df = iid_group_by_user_df.merge(rating_group_by_user_df, on=['uid'], how='left')

In [153]:
group_by_user_df.head()

Unnamed: 0,uid,iid,rating
0,0,"[0, 1432, 3081, 3135, 3964, 6598]","[1.0, 5.0, 1.0, 5.0, 1.0, 2.0]"
1,1,"[0, 2223, 4380, 8388, 10211]","[4.0, 5.0, 1.0, 5.0, 2.0]"
2,2,"[3760, 5782, 6898, 7141, 7823, 7930]","[1.0, 3.0, 5.0, 3.0, 5.0, 1.0]"
3,3,"[0, 5363, 6293, 6308, 6339, 6590, 6843, 8069]","[3.0, 5.0, 3.0, 2.0, 5.0, 5.0, 5.0, 4.0]"
4,4,"[0, 5390, 6842, 6944, 7580, 7582, 7823, 7947, ...","[4.0, 5.0, 5.0, 4.0, 3.0, 1.0, 3.0, 5.0, 4.0, ..."


In [156]:
# zipped = list(zip(iid_group_by_user_df.uid, iid_group_by_user_df.iid))
values = []
rows = []
cols = []
uniq_iids = training_df.iid.unique()
# Look column positions up in a dict built once, instead of scanning uniq_iids
# with np.where for every single rating (which is quadratic in the data size).
iid_to_pos = {iid: pos for pos, iid in enumerate(uniq_iids.tolist())}
for index, row in group_by_user_df.iterrows():
    # one matrix row per user; the row number is the user's position in group_by_user_df
    rows.extend([index] * len(row['iid']))
    cols.extend(iid_to_pos[el] for el in row['iid'])
    values.extend(row['rating'])
csr = csr_matrix((values, (rows, cols)))

In [157]:
csr.todense()

matrix([[ 1.,  0.,  0., ...,  0.,  0.,  0.],
        [ 4.,  0.,  0., ...,  0.,  0.,  0.],
        [ 0.,  0.,  0., ...,  0.,  0.,  0.],
        ..., 
        [ 0.,  0.,  0., ...,  0.,  0.,  0.],
        [ 0.,  0.,  0., ...,  0.,  0.,  0.],
        [ 0.,  0.,  0., ...,  0.,  0.,  0.]])

TypeError: can only concatenate list (not "int") to list

In [112]:
# `zipped` is not assigned by any live cell (only by a commented-out line), so
# build it here: one (uid, [iid, ...]) pair per user.
zipped = list(zip(iid_group_by_user_df.uid, iid_group_by_user_df.iid))
# Expand the first user's pair into (uid, iid) tuples; only that user is displayed,
# so there is no need to expand every user first.
[(zipped[0][0], el) for el in zipped[0][1]]

[(0, 0), (0, 1432), (0, 3081), (0, 3135), (0, 3964), (0, 6598)]

In [81]:
lst = list(zipped)

In [89]:
zipped[0][0]

0

In [59]:
# NOTE(review): a list of three Series is taken by csr_matrix as a dense
# 3 x n_reviews array (one row each for rating, uid and iid — see the output of
# a.todense() in the next cell), not as a user x item matrix. The coordinate form
# csr_matrix((data, (row_ind, col_ind))) is presumably what was intended; confirm.
a = csr_matrix([training_df['rating'],training_df['uid'],training_df['iid']])

In [61]:
a.todense()

matrix([[  1.00000000e+00,   4.00000000e+00,   3.00000000e+00, ...,
           5.00000000e+00,   5.00000000e+00,   3.00000000e+00],
        [  0.00000000e+00,   1.00000000e+00,   3.00000000e+00, ...,
           2.41300000e+04,   1.72400000e+04,   2.31690000e+04],
        [  0.00000000e+00,   0.00000000e+00,   0.00000000e+00, ...,
           1.03850000e+04,   1.03850000e+04,   1.03850000e+04]])

In [50]:
iid_group_by_user_df.head()

Unnamed: 0,uid,iid
0,0,"[0, 1432, 3081, 3135, 3964, 6598]"
1,1,"[0, 2223, 4380, 8388, 10211]"
2,2,"[3760, 5782, 6898, 7141, 7823, 7930]"
3,3,"[0, 5363, 6293, 6308, 6339, 6590, 6843, 8069]"
4,4,"[0, 5390, 6842, 6944, 7580, 7582, 7823, 7947, ..."


In [22]:
group_by_user_df.iid.values[0]

[0, 1432, 3081, 3135, 3964, 6598]

In [None]:
# NOTE(review): unfinished draft cell (it carries no execution count).
rows = []
cols = []
    
# Rebinds group_by_user_df to per-user lists of iid only (no rating column).
group_by_user_df = training_df.groupby('uid')['iid'].apply(list).reset_index()
# NOTE(review): bare expression with no effect; `row` resolves only if a
# top-level loop in an earlier cell has already bound it.
row
# NOTE(review): `data` is not assigned at notebook level by any visible cell, and
# rows / cols are still empty here, so this line cannot build a matrix as written.
csr = csr_matrix((data, (rows, cols)))

In [21]:
from scipy.sparse import csr_matrix
from sklearn.preprocessing import normalize
from scipy.sparse import vstack
def load_data(df):
    """Turn a review frame into a sparse user x item rating matrix.

    User and item ids are mapped to consecutive row / column numbers in order
    of first appearance in df.

    :param df: frame with the columns uid, iid and rating
    :return: tuple (ui_m, uid_to_row, iid_to_col) — the CSR matrix and the two
        id-to-position dictionaries used to build it
    """
    uid_to_row, iid_to_col = {}, {}
    row_idx, col_idx, ratings = [], [], []

    for rec in df.itertuples():
        # register ids the first time they are seen
        if rec.uid not in uid_to_row:
            uid_to_row[rec.uid] = len(uid_to_row)
        if rec.iid not in iid_to_col:
            iid_to_col[rec.iid] = len(iid_to_col)

        row_idx.append(uid_to_row[rec.uid])
        col_idx.append(iid_to_col[rec.iid])
        ratings.append(rec.rating)

    ui_m = csr_matrix((ratings, (row_idx, col_idx)))
    return ui_m, uid_to_row, iid_to_col

In [33]:
ui_m, uid_to_row, iid_to_col = load_data(training_df)

In [34]:
from sklearn.preprocessing import normalize
ui_m_normalized = normalize(ui_m, norm='l1', axis=1)

In [37]:
from sklearn.metrics.pairwise import cosine_similarity
ii_sim_m = cosine_similarity(ui_m_normalized.T.tocsr(), dense_output=False)

In [38]:
ii_sim_m.data

array([ 0.18311559,  0.06571599,  0.02357235, ...,  0.00745994,
        0.0050334 ,  0.00473716])

In [39]:
def get_topk(matrix, top, axis=1):
    """Convert a sparse matrix to a Top-K matrix in which every row (or column)
    keeps only its `top` largest stored values.

    :param matrix: source scipy sparse matrix
    :param top: number of stored values to keep per row / column; None keeps
        everything, 0 yields a matrix without stored values
    :param axis: 0 - top by column, 1 - top by row
    :return: CSR matrix with the same shape as the source matrix
    :raises ValueError: if top is negative
    """
    if top is not None and top < 0:
        raise ValueError("top must be None or a non-negative integer")

    # Always work row-wise on CSR; for column-wise selection operate on the
    # transpose and flip the result back at the end.
    work = matrix.T.tocsr() if axis == 0 else matrix.tocsr()
    indptr, indices, values = work.indptr, work.indices, work.data

    rows = []
    cols = []
    data = []
    for row_id in range(work.shape[0]):
        start, stop = indptr[row_id], indptr[row_id + 1]
        nnz = stop - start
        if nnz == 0:
            continue

        row_cols = indices[start:stop]
        row_vals = values[start:stop]
        if top is not None and nnz > top:
            # Slice from nnz - top rather than -top: with top == 0 a "-0" slice
            # would select every element instead of none.
            keep = np.argsort(row_vals)[nnz - top:]
            row_cols = row_cols[keep]
            row_vals = row_vals[keep]

        rows += [row_id] * len(row_vals)
        cols += row_cols.tolist()
        data += row_vals.tolist()

    # Explicit integer index arrays keep the constructor well-defined when nothing was kept.
    coords = (np.asarray(rows, dtype=np.int64), np.asarray(cols, dtype=np.int64))
    topk_m = csr_matrix((data, coords), shape=work.shape)

    if axis == 0:
        topk_m = topk_m.T.tocsr()

    return topk_m

In [40]:
# helper functions that may come in handy when building Item-based CF
def nullify_main_diagonal(m):
    """Return a copy of sparse matrix m with its main diagonal set to zero.

    :param m: scipy sparse matrix (square or rectangular)
    :return: sparse matrix of the same shape, equal to m outside the main diagonal
    """
    # The main diagonal of a rectangular matrix has min(rows, cols) entries;
    # using the row count alone would run past the last column of a tall matrix.
    positions = np.arange(min(m.shape))
    eye = csr_matrix((np.ones(len(positions)), (positions, positions)), m.shape)
    # m.multiply(eye) isolates the diagonal entries; subtracting them zeroes the diagonal.
    return m - m.multiply(eye)

In [41]:
ii_sim_m = nullify_main_diagonal(ii_sim_m)

In [42]:
print("Density", ii_sim_m.nnz / (ii_sim_m.shape[0] * ii_sim_m.shape[1]))

Density 0.038369689069303436


In [44]:
ii_sim_m = get_topk(ii_sim_m, 10)

In [45]:
print("Density", ii_sim_m.nnz / (ii_sim_m.shape[0] * ii_sim_m.shape[1]))

Density 0.0009872255657676183


In [46]:
uid = 0
# Rows of ui_m_normalized are numbered by load_data (uid_to_row), not by raw user id,
# so translate the id before indexing. Multiplying the user's normalized rating row
# by the item-item similarity matrix gives one score per item column.
recs = ui_m_normalized[uid_to_row[uid]].dot(ii_sim_m.T)

In [47]:
recs

<1x10098 sparse matrix of type '<class 'numpy.float64'>'
	with 60 stored elements in Compressed Sparse Row format>

In [48]:
# recs.indices holds matrix column positions (as assigned by load_data), so invert
# iid_to_col to report real item ids instead of column numbers.
col_to_iid = {col: iid for iid, col in iid_to_col.items()}
# argsort is ascending: take the last five positions and reverse them for best-first order.
for arg_id in np.argsort(recs.data)[-5:][::-1]:
    iid = col_to_iid[recs.indices[arg_id]]
    score = recs.data[arg_id]
    print(iid, score)

3135 0.177605608707
3964 0.133403027282
1432 0.130515454755
2272 0.115631021861
9701 0.0875753871298
