In [81]:
import pandas as pd
import numpy as np
from scipy.sparse import csr_matrix
%matplotlib inline
import matplotlib.pyplot as plt

In [82]:
JSON_DATA_PATH = "data/reviews.json"
N = 10

In [83]:
import json

def iter_json_data(path):
    with open(path) as f:
        for line in f:
            data = json.loads(line)
            yield data
            
def get_data_frame():
    uid_to_id = {}
    iid_to_id = {}
    
    cols = ["uid", "iid", "review", "rating", "dt", "helpful", "summary"]
    rows = []
    for d in iter_json_data(JSON_DATA_PATH):
        uid = uid_to_id.setdefault(d["reviewerID"], len(uid_to_id))
        iid = iid_to_id.setdefault(d["asin"], len(iid_to_id))
        review = d["reviewText"]
        rating = float(d["overall"])
        dt = int(d["unixReviewTime"])
        helpful = 0 if d["helpful"][1] == 0 else d["helpful"][0]/d["helpful"][1]
        summary = d["summary"]
        rows.append((uid, iid, review, rating, dt, helpful, summary))
        
    return pd.DataFrame(rows, columns=cols)

In [84]:
df = get_data_frame()
df.head()

Unnamed: 0,uid,iid,review,rating,dt,helpful,summary
0,0,0,Installing the game was a struggle (because of...,1.0,1341792000,0.666667,Pay to unlock content? I don't think so.
1,1,0,If you like rally cars get this game you will ...,4.0,1372550400,0.0,Good rally game
2,2,0,1st shipment received a book instead of the ga...,1.0,1403913600,0.0,Wrong key
3,3,0,"I got this version instead of the PS3 version,...",3.0,1315958400,0.7,"awesome game, if it did not crash frequently !!"
4,4,0,I had Dirt 2 on Xbox 360 and it was an okay ga...,4.0,1308009600,1.0,DIRT 3


In [85]:
def split_df_by_dt(df, p=0.8):
    """Функция разбивает df на тестовую и тренировочную выборки по времени 
    публикации отзывов (значение времени в поле dt)
    
    :param p: персентиль значений dt, которые образуют тренировочную выборку. Например p=0.8 означает, что в 
    тренировочной части будут отзывы, соответствующие первым 80% временного интервала 
    :return: два pd.DataFrame объекта
    """
    border_dt = df.dt.quantile(p)
    print("Min=%s, border=%s, max=%s" % (df.dt.min(), border_dt, df.dt.max()))
    training_df, test_df  = df[df.dt <= border_dt], df[df.dt > border_dt]
    print("Размер до очистки:", training_df.shape, test_df.shape)
    # удаляем из тестовых данных строки, соответствующие пользователям или объектам, 
    # которых нет в тренировочных данных 
    # (пользователи - избегаем проблем для персональных систем, объекты - для всех)
    test_df = test_df[test_df.uid.isin(training_df.uid) & test_df.iid.isin(training_df.iid)]
    print("Размер после очистки:", training_df.shape, test_df.shape)
    return training_df, test_df

In [86]:
def hit_ratio(recs_dict, test_dict):
    """Функция считает метрику hit-ration для двух словарей
    :recs_dict: словарь рекомендаций типа {uid: {iid: score, ...}, ...}
    :test_dict: тестовый словарь типа {uid: {iid: score, ...}, ...}
    """
    hits = 0
    for uid in test_dict:
#         print(test_dict[uid].keys())
#         print(recs_dict.get(uid, {}))
        if set(test_dict[uid].keys()).intersection(recs_dict.get(uid, {})):
            hits += 1
    return hits / len(test_dict)

In [87]:
training_df, test_df = split_df_by_dt(df)
del df

Min=939859200, border=1377129600.0, max=1405987200
Размер до очистки: (185427, 7) (46353, 7)
Размер после очистки: (185427, 7) (19174, 7)


In [88]:
training_df.head()

Unnamed: 0,uid,iid,review,rating,dt,helpful,summary
0,0,0,Installing the game was a struggle (because of...,1.0,1341792000,0.666667,Pay to unlock content? I don't think so.
1,1,0,If you like rally cars get this game you will ...,4.0,1372550400,0.0,Good rally game
3,3,0,"I got this version instead of the PS3 version,...",3.0,1315958400,0.7,"awesome game, if it did not crash frequently !!"
4,4,0,I had Dirt 2 on Xbox 360 and it was an okay ga...,4.0,1308009600,1.0,DIRT 3
5,5,0,"Overall this is a well done racing game, with ...",4.0,1368230400,0.0,"Good racing game, terrible Windows Live Requir..."


In [89]:
from scipy.sparse import find
class BasicRecommender(object):
    def __init__(self):
        pass
    
    def get_recs(self, uid, top):
        """Строит рекомендации для пользователя uid
        :return: словарь типа {iid: score, ...}
        """
        return {}
    
    def get_batch_recs(self, uids, top):
        """Строит рекомендации для нескольких пользователей uids
        :return: словарь типа {uid: {iid: score, ...}, ...}
        """
        return {uid: self.get_recs(uid, top) for uid in uids}

In [100]:
class ItemBasedCollaborativeFilteringRS(BasicRecommender):
    
    
    def __init__(self, df):
        super(ItemBasedCollaborativeFilteringRS, self).__init__()
        self.recs = self._prepare_recs(df)
    
    def get_recs(self, uid, top):
        
        index = np.where(self.rows == uid)[0]
        iid_dict = {}
        has_iid = list(self.group_by_user_df[self.group_by_user_df.uid == uid].iid)[0]
        
        for arg_id in np.argsort(self.recs[index].data)[-top*3:][::-1]:
            iid = self.recs[index].indices[arg_id]
            score = self.recs[index].data[arg_id]
            if not iid is has_iid and len(iid_dict) < top:
                iid_dict[self.cols[iid]] = score


        return iid_dict
    
    def get_batch_recs(self, uids, top):
#         non_pers_recs = self.get_recs(None, top)
        return {uid: self.get_recs(uid, top) for uid in uids}
    
    def _prepare_recs(self, df):
        self.csr, self.rows, self.cols = get_user_item_matrix(df)
        csr_normalized = normalize(self.csr, norm='l1', axis=1)
        self.similarities_sparse = get_topk(nullify_main_diagonal(cosine_similarity(
            csr_normalized.T.tocsr(), dense_output=False)), 50)

#         row, col, self.data = find(self.similarities_sparse)
        recs = csr_normalized.dot(self.similarities_sparse.T)
        iid_group_by_user_df = df.groupby('uid')['iid'].apply(list).reset_index()
        rating_group_by_user_df = df.groupby('uid')['rating'].apply(list).reset_index()
        self.group_by_user_df = iid_group_by_user_df.merge(rating_group_by_user_df, on=['uid'], how='left')
        
        return recs

In [101]:
def get_topk(matrix, top, axis=1):
    """Converts source matrix to Top-K matrix
    where each row or column contains only top K values

    :param matrix: source matrix
    :param top: number of top items to be stored
    :param axis: 0 - top by column, 1 - top by row
    :return:
    """
    rows = []
    cols = []
    data = []

    if axis == 0:
        matrix = matrix.T.tocsr()

    for row_id, row in enumerate(matrix):
        if top is not None and row.nnz > top:
            top_args = np.argsort(row.data)[-top:]

            rows += [row_id] * top
            cols += row.indices[top_args].tolist()
            data += row.data[top_args].tolist()
        elif row.nnz > 0:
            rows += [row_id] * row.nnz
            cols += row.indices.tolist()
            data += row.data.tolist()

    topk_m = csr_matrix((data, (rows, cols)), (matrix.shape[0], matrix.shape[1]))

    if axis == 0:
        topk_m = topk_m.T.tocsr()

    return topk_m

In [102]:
# вспомогательные функции, которые могут пригодиться при построении Item-based CF
def nullify_main_diagonal(m):
    positions = range(m.shape[0])
    eye = csr_matrix((np.ones(len(positions)), (positions, positions)), m.shape)
    return m - m.multiply(eye)

In [103]:
from scipy.sparse import csr_matrix
# генерация матрицы user-item 
def get_user_item_matrix(df):
    values = []
    rows = []
    cols = []
        
    uniq_iids = df.iid.unique()
    
    iid_group_by_user_df = df.groupby('uid')['iid'].apply(list).reset_index()
    rating_group_by_user_df = df.groupby('uid')['rating'].apply(list).reset_index()
    # собираем список итемов с рейтингами пример uid = 0	 iid [0, 1432, 3081, 3135, 3964, 6598]	 
    # rating[1.0, 5.0, 1.0, 5.0, 1.0, 2.0]    
    group_by_user_df = iid_group_by_user_df.merge(rating_group_by_user_df, on=['uid'], how='left')
    
    for index, row in group_by_user_df.iterrows():
        rows.extend([index] * len(row['iid']))
        cols.extend([np.where(uniq_iids==el)[0][0] for el in row['iid']])
        values.extend([el for el in row['rating']])
        
    csr = csr_matrix((values, (rows, cols)))
    
    # вернем не только матрицу, но и списки уникальных id для item и user, чтобы потом понимать окаких пользователях 
    # и играх идет речь
    
    return csr, group_by_user_df.uid, uniq_iids 

In [104]:
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import normalize

def get_sim_item_matrix(csr):
    # нормализуем сначала матрицу user-item и построим матрицу схожести
    csr_normalized = normalize(csr, norm='l1', axis=1)
    similarities_csr = get_topk(nullify_main_diagonal(cosine_similarity(csr_normalized.T.tocsr(),
                                                                        dense_output=False)), 30)
    return similarities_csr 

In [105]:
def get_test_dict(test_df):
    """Функция, конвертирующая тестовый df в словарь
    """
    test_dict = {}
    for t in test_df.itertuples():
        test_dict.setdefault(t.uid, {})
        test_dict[t.uid][t.iid] = t.rating
    return test_dict

In [106]:
test_dict = get_test_dict(test_df)

In [107]:
ibcf = ItemBasedCollaborativeFilteringRS(training_df)

In [108]:
recs_dict = ibcf.get_batch_recs(test_df.uid.unique(), N)

In [109]:
hit_ratio(recs_dict, test_dict)

0.08730741012472487

In [113]:
len(recs_dict[2])

30