In [1]:
import sys
sys.path.append("third_code/recommenders-master/")

import logging
import numpy as np
import pandas as pd
import papermill as pm
import time
import os 
os.environ['NUMEXPR_MAX_THREADS'] = '64'

from reco_utils.common.timer import Timer
from reco_utils.dataset import movielens
from reco_utils.dataset.python_splitters import python_stratified_split
from reco_utils.evaluation.python_evaluation import map_at_k, ndcg_at_k, precision_at_k, recall_at_k
from reco_utils.recommender.sar import SAR

print("System version: {}".format(sys.version))
print("Pandas version: {}".format(pd.__version__))

System version: 3.6.9 |Anaconda, Inc.| (default, Jul 30 2019, 19:07:31) 
[GCC 7.3.0]
Pandas version: 0.25.3


In [2]:
def filter_entity_row(data):
    """删除完全一致的行"""
    t = (2020, 4, 10, 0, 0, 0, 0, 0, 0)
    time_end = time.mktime(t)

    data['time_diff'] = data['qtime'].diff() * time_end
    first_rec = data.time_diff.isnull() | (data.time_diff < 0)
    data.loc[first_rec, 'time_diff'] = -1
    data['timestamp'] = data['qtime'] * time_end
    return data[data['time_diff'] != 0]

    
def gen_data(dir_name, predict=False, drop_dup=True):    
    all_files = [dir_name+f for f in os.listdir(dir_name) if "csv" in f and 'click' in f]
    li = []
    for filename in all_files:
        print(filename)
        df = pd.read_csv(filename, header=None, names=['user_id', 'item_id', 'qtime'])
        li.append(df)

    df_full = pd.concat(li, axis=0, ignore_index=True)
    df_full.sort_values(by=['user_id', 'qtime'], inplace=True)
    print("before drop length:", len(df_full))
    if drop_dup:
        df_full = filter_entity_row(df_full)
        print("after drop entity rowlength:", len(df_full))
    return df_full

In [3]:
train_dir = "../data/underexpose_train/"
test_dir = "../data/underexpose_test/"

In [4]:
train_data = gen_data(train_dir)
test_data = gen_data(test_dir)
whole_click = pd.concat([train_data, test_data])
whole_click['rating'] = 1
whole_qtime = test_data.drop_duplicates(subset=['user_id'])[['user_id']]

../data/underexpose_train/underexpose_train_click-1.csv
../data/underexpose_train/underexpose_train_click-2.csv
../data/underexpose_train/underexpose_train_click-0.csv
before drop length: 727485
after drop entity rowlength: 456951
../data/underexpose_test/underexpose_test_click-1.csv
../data/underexpose_test/underexpose_test_click-0.csv
../data/underexpose_test/underexpose_test_click-2.csv
before drop length: 68426
after drop entity rowlength: 68418


In [5]:
train, test = python_stratified_split(whole_click, ratio=0.75, col_user='user_id', col_item='item_id', seed=42)

In [6]:
print("""
Train:
Total Ratings: {train_total}
Unique Users: {train_users}
Unique Items: {train_items}

Test:
Total Ratings: {test_total}
Unique Users: {test_users}
Unique Items: {test_items}
""".format(
    train_total=len(train),
    train_users=len(train['user_id'].unique()),
    train_items=len(train['item_id'].unique()),
    test_total=len(test),
    test_users=len(test['user_id'].unique()),
    test_items=len(test['item_id'].unique()),
))


Train:
Total Ratings: 393650
Unique Users: 23816
Unique Items: 61791

Test:
Total Ratings: 131719
Unique Users: 23692
Unique Items: 50978



In [31]:
logging.basicConfig(level=logging.DEBUG, 
                    format='%(asctime)s %(levelname)-8s %(message)s')

model = SAR(
    col_user="user_id",
    col_item="item_id",
    col_rating="qtime",
    col_timestamp="timestamp",
    similarity_type="cooccurrence", 
    time_decay_coefficient=3, 
    timedecay_formula=True
)

In [32]:
with Timer() as train_time:
    model.fit(whole_click)

print("Took {} seconds for training.".format(train_time.interval))

2020-04-18 21:08:09,820 INFO     Collecting user affinity matrix
2020-04-18 21:08:09,828 INFO     Calculating time-decayed affinities
2020-04-18 21:08:09,974 INFO     Creating index columns
2020-04-18 21:08:10,661 INFO     Building user affinity sparse matrix
2020-04-18 21:08:10,681 INFO     Calculating item co-occurrence
2020-04-18 21:08:11,751 INFO     Calculating item similarity
2020-04-18 21:08:11,752 INFO     Using co-occurrence based similarity
2020-04-18 21:08:11,753 INFO     Done training


Took 1.9589013820514083 seconds for training.


In [33]:
with Timer() as test_time:
    top_k = model.recommend_k_items(test, remove_seen=True, top_k=50)

2020-04-18 21:08:14,456 INFO     Calculating recommendation scores
2020-04-18 21:08:25,982 INFO     Removing seen items


In [34]:
TOP_K = 50
args = [test, top_k]
kwargs = dict(col_user='user_id', 
              col_item='item_id', 
              col_rating='qtime', 
              col_prediction='prediction', 
              relevancy_method='top_k', 
              k=TOP_K)


eval_map = map_at_k(*args, **kwargs)
eval_ndcg = ndcg_at_k(*args, **kwargs)
eval_precision = precision_at_k(*args, **kwargs)
eval_recall = recall_at_k(*args, **kwargs)

In [35]:
print(f"Model:",
      f"Top K:\t\t {TOP_K}",
      f"MAP:\t\t {eval_map:f}",
      f"NDCG:\t\t {eval_ndcg:f}",
      f"Precision@K:\t {eval_precision:f}",
      f"Recall@K:\t {eval_recall:f}", sep='\n')

Model:
Top K:		 50
MAP:		 0.000000
NDCG:		 0.000000
Precision@K:	 0.000000
Recall@K:	 0.000000


In [41]:
with Timer() as test_time:
    top_k = model.recommend_k_items(whole_qtime, remove_seen=True, top_k=50)

print("Took {} seconds for prediction.".format(test_time.interval))
sub = top_k.groupby('user_id')['item_id'].apply(lambda x: ','.join(
        [str(i) for i in x])).str.split(',', expand=True).reset_index()

sub.to_csv("sub_sar_drop_dup.csv", header=None, index=False)