In [420]:
import pandas as pd
import numpy as np
from scipy.sparse import csr_matrix
%matplotlib inline
import matplotlib.pyplot as plt
from scipy.sparse import find

In [421]:
JSON_DATA_PATH = "data/reviews.json"
N = 10

In [422]:
import json

def iter_json_data(path):
    with open(path) as f:
        for line in f:
            data = json.loads(line)
            yield data
            
def get_data_frame():
    uid_to_id = {}
    iid_to_id = {}
    
    cols = ["uid", "iid", "review", "rating", "dt", "helpful", "summary"]
    rows = []
    for d in iter_json_data(JSON_DATA_PATH):
        uid = uid_to_id.setdefault(d["reviewerID"], len(uid_to_id))
        iid = iid_to_id.setdefault(d["asin"], len(iid_to_id))
        review = d["reviewText"]
        rating = float(d["overall"])
        dt = int(d["unixReviewTime"])
        helpful = 0 if d["helpful"][1] == 0 else d["helpful"][0]/d["helpful"][1]
        summary = d["summary"]
        rows.append((uid, iid, review, rating, dt, helpful, summary))
        
    return pd.DataFrame(rows, columns=cols)

In [423]:
df = get_data_frame()
df.head()

Unnamed: 0,uid,iid,review,rating,dt,helpful,summary
0,0,0,Installing the game was a struggle (because of...,1.0,1341792000,0.666667,Pay to unlock content? I don't think so.
1,1,0,If you like rally cars get this game you will ...,4.0,1372550400,0.0,Good rally game
2,2,0,1st shipment received a book instead of the ga...,1.0,1403913600,0.0,Wrong key
3,3,0,"I got this version instead of the PS3 version,...",3.0,1315958400,0.7,"awesome game, if it did not crash frequently !!"
4,4,0,I had Dirt 2 on Xbox 360 and it was an okay ga...,4.0,1308009600,1.0,DIRT 3


In [424]:
def split_df_by_dt(df, p=0.8):
    """Функция разбивает df на тестовую и тренировочную выборки по времени 
    публикации отзывов (значение времени в поле dt)
    
    :param p: персентиль значений dt, которые образуют тренировочную выборку. Например p=0.8 означает, что в 
    тренировочной части будут отзывы, соответствующие первым 80% временного интервала 
    :return: два pd.DataFrame объекта
    """
    border_dt = df.dt.quantile(p)
    print("Min=%s, border=%s, max=%s" % (df.dt.min(), border_dt, df.dt.max()))
    training_df, test_df  = df[df.dt <= border_dt], df[df.dt > border_dt]
    print("Размер до очистки:", training_df.shape, test_df.shape)
    # удаляем из тестовых данных строки, соответствующие пользователям или объектам, 
    # которых нет в тренировочных данных 
    # (пользователи - избегаем проблем для персональных систем, объекты - для всех)
    test_df = test_df[test_df.uid.isin(training_df.uid) & test_df.iid.isin(training_df.iid)]
    print("Размер после очистки:", training_df.shape, test_df.shape)
    return training_df, test_df

In [425]:
def clean_df(df, min_review_per_uid, min_review_per_iid):
    """Функция удаляет из df строки, соответствующие пользователям и объектам, 
    у которых меньше min_review_per_uid и min_review_per_iid отзывов соответственно
    """
    _df = df.copy()
    while True:
        review_per_uid = _df.groupby("uid").review.count()
        bad_uids = review_per_uid[review_per_uid < min_review_per_uid].index
    
        review_per_iid = _df.groupby("iid").review.count()
        bad_iids = review_per_iid[review_per_iid < min_review_per_iid].index
        
        if bad_uids.shape[0] > 0 or bad_iids.shape[0] > 0:
            _df = _df[(~_df.uid.isin(bad_uids)) & (~_df.iid.isin(bad_iids))]
        else:
            break
    return _df

In [426]:
def hit_ratio(recs_dict, test_dict):
    """Функция считает метрику hit-ration для двух словарей
    :recs_dict: словарь рекомендаций типа {uid: {iid: score, ...}, ...}
    :test_dict: тестовый словарь типа {uid: {iid: score, ...}, ...}
    """
    hits = 0
    for uid in test_dict:
#         print(test_dict[uid].keys())
#         print(recs_dict.get(uid, {}))
        if set(test_dict[uid].keys()).intersection(recs_dict.get(uid, {})):
            hits += 1
    return hits / len(test_dict)

In [427]:
training_df, test_df = split_df_by_dt(clean_df(df, 5, 20))
del df

Min=943747200, border=1379635200.0, max=1405987200
Размер до очистки: (80884, 7) (20212, 7)
Размер после очистки: (80884, 7) (8198, 7)


In [428]:
def get_test_dict(test_df):
    """Функция, конвертирующая тестовый df в словарь
    """
    test_dict = {}
    for t in test_df.itertuples():
        test_dict.setdefault(t.uid, {})
        test_dict[t.uid][t.iid] = t.rating
    return test_dict

test_dict = get_test_dict(test_df)

In [429]:
def get_topk(matrix, top, axis=1):
    """Converts source matrix to Top-K matrix
    where each row or column contains only top K values

    :param matrix: source matrix
    :param top: number of top items to be stored
    :param axis: 0 - top by column, 1 - top by row
    :return:
    """
    rows = []
    cols = []
    data = []

    if axis == 0:
        matrix = matrix.T.tocsr()

    for row_id, row in enumerate(matrix):
        if top is not None and row.nnz > top:
            top_args = np.argsort(row.data)[-top:]

            rows += [row_id] * top
            cols += row.indices[top_args].tolist()
            data += row.data[top_args].tolist()
        elif row.nnz > 0:
            rows += [row_id] * row.nnz
            cols += row.indices.tolist()
            data += row.data.tolist()

    topk_m = csr_matrix((data, (rows, cols)), (matrix.shape[0], matrix.shape[1]))

    if axis == 0:
        topk_m = topk_m.T.tocsr()

    return topk_m

In [430]:
# вспомогательные функции, которые могут пригодиться при построении Item-based CF
def nullify_main_diagonal(m):
    positions = range(m.shape[0])
    eye = csr_matrix((np.ones(len(positions)), (positions, positions)), m.shape)
    return m - m.multiply(eye)

In [431]:
from scipy.sparse import csr_matrix
# генерация матрицы user-item 
def get_user_item_matrix(df):
    values = []
    rows = []
    cols = []
        
    uniq_iids = df.iid.unique()
    
    iid_group_by_user_df = df.groupby('uid')['iid'].apply(list).reset_index()
    rating_group_by_user_df = df.groupby('uid')['rating'].apply(list).reset_index()
    # собираем список итемов с рейтингами пример uid = 0	 iid [0, 1432, 3081, 3135, 3964, 6598]	 
    # rating[1.0, 5.0, 1.0, 5.0, 1.0, 2.0]    
    group_by_user_df = iid_group_by_user_df.merge(rating_group_by_user_df, on=['uid'], how='left')
    
    for index, row in group_by_user_df.iterrows():
        rows.extend([index] * len(row['iid']))
        cols.extend([np.where(uniq_iids==el)[0][0] for el in row['iid']])
        values.extend([el for el in row['rating']])
        
    csr = csr_matrix((values, (rows, cols)))
    
    # вернем не только матрицу, но и списки уникальных id для item и user, чтобы потом понимать окаких пользователях 
    # и играх идет речь
    
    return csr, group_by_user_df.uid, uniq_iids 

In [432]:
class BasicRecommender(object):
    def __init__(self):
        pass
    
    def get_recs(self, uid, top):
        """Строит рекомендации для пользователя uid
        :return: словарь типа {iid: score, ...}
        """
        return {}
    
    def get_batch_recs(self, uids, top):
        """Строит рекомендации для нескольких пользователей uids
        :return: словарь типа {uid: {iid: score, ...}, ...}
        """
        return {uid: self.get_recs(uid, top) for uid in uids}
    
class NonPersRecommender(BasicRecommender):
    def __init__(self, df):
        super(NonPersRecommender, self).__init__()
        self.recs = self._prepare_recs(df)
        
    def _prepare_recs(self, df):
        return pd.Series([])
    
    def get_recs(self, uid, top):
        return self.recs[:top].to_dict()
    
    def get_batch_recs(self, uids, top):
        non_pers_recs = self.get_recs(None, top)
        return {uid: non_pers_recs for uid in uids}
    
class MostReviewedRS(NonPersRecommender):
    def _prepare_recs(self, df):
        # считаем количество отзывов для каждого объекта (pandas сортирует их по убыванию)

        return df.iid.value_counts()

In [433]:
class MeanRatingRS(NonPersRecommender):
    
    def __init__(self, df, month, year):
        
        self.y = year
        self.m = month
        self.recs = self._prepare_recs(df)
        
    
    def _prepare_recs(self, df):
        df['date'] = pd.to_datetime(df['dt'],unit='s')
        # посчитаем средний рейтинг, количество оценок и дату самого раннего отзыва
        d = df.groupby('iid').agg({"rating": {"mean", "count"}, "date" : {"min"}}).reset_index()
        d.columns = d.columns.droplevel(0)
        d.rename(columns={'min': 'date'}, inplace=True)
        d['date'] = pd.to_datetime(d['date'],unit='s')
        #      
        year = d['date'].max().year
        month = d['date'].max().month
        if month + self.m < 0:
            diff = abs(month + self.m)
        else:
            diff = month + self.m
        
        d = d[(d['date'].dt.year > year + self.y) & (d['date'].dt.month > diff)] \
            .sort_values(['count'], ascending = False)
            
        d.rename(columns={'': 'iid'}, inplace=True)
        d.set_index('iid', inplace=True)
        d = d[(d['mean'] < 5) & (d['mean'] > 4)]
        return d['count'].T

In [434]:
from sklearn.preprocessing import normalize
from sklearn.metrics.pairwise import cosine_similarity
class ItemBasedCollaborativeFilteringRS(BasicRecommender):
    
    
    def __init__(self, df):
        super(ItemBasedCollaborativeFilteringRS, self).__init__()
        self.recs = self._prepare_recs(df)
    
    def get_recs(self, uid, top):
        
        index = np.where(self.rows == uid)[0]
        iid_dict = {}
        has_iid = list(self.group_by_user_df[self.group_by_user_df.uid == uid].iid)[0]
        
        recs = self.csr_normalized[index].dot(self.similarities_sparse.T)
        
        for arg_id in np.argsort(recs.data)[-top*3:][::-1]:
            iid = recs.indices[arg_id]
            score = recs.data[arg_id]
            if not iid is has_iid and len(iid_dict) < top:
                iid_dict[self.cols[iid]] = score
        
        return iid_dict
    
    def get_batch_recs(self, uids, top):
        return {uid: self.get_recs(uid, top) for uid in uids}
    
    def _prepare_recs(self, df):
        self.csr, self.rows, self.cols = get_user_item_matrix(df)
        self.csr_normalized = normalize(self.csr)

        self.similarities_sparse = get_topk(nullify_main_diagonal(normalize(cosine_similarity(
            self.csr_normalized.T.tocsr(), dense_output=False))), 30)

        row, col, self.data = find(self.similarities_sparse)

        iid_group_by_user_df = df.groupby('uid')['iid'].apply(list).reset_index()
        rating_group_by_user_df = df.groupby('uid')['rating'].apply(list).reset_index()
        self.group_by_user_df = iid_group_by_user_df.merge(rating_group_by_user_df, on=['uid'], how='left')
        
        return self.similarities_sparse

In [435]:
test_dict = get_test_dict(test_df)

In [436]:
rs_most_reviewed = MostReviewedRS(training_df)
recs_dict_most = rs_most_reviewed.get_batch_recs(test_df.uid, N)

In [437]:
rs_mean_rating = MeanRatingRS(training_df, -6, -2)
recs_dict_mean = rs_mean_rating.get_batch_recs(test_df.uid, N)

In [438]:
ibcf = ItemBasedCollaborativeFilteringRS(training_df)
recs_dict_itcf = ibcf.get_batch_recs(test_df.uid.unique(), N)

In [439]:
d = training_df.groupby('iid').agg({"rating": {"count"}, "date" : {"min"}}).reset_index()
d.columns = d.columns.droplevel(0)
d.rename(columns={'min': 'date'}, inplace=True)
d.rename(columns={'': 'iid'}, inplace=True)
d = d.sort_values( ['date'], ascending = [ False])
d = d[d['count'] > 100]

In [440]:
new_pop_games = d.head(5)

In [441]:
gr = training_df.groupby('uid')['iid'].count().reset_index()

In [442]:
uid_most_buy = gr[gr['iid'] > 30]['uid']

In [494]:
# идея в следущем, если пользователь актвно покупал игры и ставил отзывы скорее всего для него менее актуальны топые игры
# для таких игроков рекоммендуем смесь из новых игр, набирающих популярность и тех, что нашли, с помощью 
# ItemBasedCollaborativeFilteringRS
# для менее активных клиентов возьмем смесь наиболее популярных и тех, у которых самый высокий средний рейтинг
result_dict = {}

iid_group_by_user_df = training_df.groupby('uid')['iid'].apply(list).reset_index()


for uid in test_df.uid.unique():
    recs_dict_most_s = sorted(recs_dict_most[uid].items(), key=lambda x: x[1])[::-1]
    recs_dict_mean_s = sorted(recs_dict_mean[uid].items(), key=lambda x: x[1])[::-1]
    recs_dict_itcf_s = sorted(recs_dict_itcf[uid].items(), key=lambda x: x[1])[::-1]

    has_iid = list(iid_group_by_user_df[iid_group_by_user_df.uid == uid].iid)[0]
    
    if (uid in uid_most_buy):
        temp_dict = recs_dict_itcf_s[0:4]
        for u in new_games.iid:
            if u in has_iid: 
                continue
                print('yep')
            temp_dict.append((u, 1))
            if len(temp_dict) >= N : break
        result_dict[uid] = dict(temp_dict)
    else:
        
        temp_dict = recs_dict_most_s[0:3]
        for tup in recs_dict_mean_s:
            if tup[0] in has_iid: 
                continue
                print('yep')
            temp_dict.append(tup)
            if len(temp_dict) >= N : break
        result_dict[uid] = dict(temp_dict)

In [495]:
hit_ratio(result_dict, test_dict)

0.09773013871374527