In [1]:
import numpy as np
import pandas as pd
# import matplotlib.pyplot as plt
# %matplotlib inline
import datetime

import calendar
import datetime
import os
import time
from collections import Counter
# from tqdm import tqdm

# Данные

In [2]:
def load_and_adapt(path, last_months=0):
    """
    Load an interaction log and normalise the names of its first four columns.

    :param path: path to a ``.csv`` or ``.hdf`` file; the extension is matched
        case-insensitively.
    :param last_months: if > 0, keep only the interactions of the last
        ``last_months`` calendar months, counted back from the newest ``ts``.
    :return: DataFrame whose first four columns are renamed (by position) to
        ``session_id``, ``user_id``, ``item_id``, ``ts``; any further columns
        keep their original names.
    :raises ValueError: if the extension is unsupported or the file has fewer
        than four columns.
    """
    raw_ext = os.path.splitext(path)[-1]
    file_ext = raw_ext.lower()
    if file_ext == '.csv':
        data = pd.read_csv(path, header=0)
    elif file_ext == '.hdf':
        data = pd.read_hdf(path)
    else:
        raise ValueError('Unsupported file {} having extension {}'.format(path, raw_ext))

    if data.shape[1] < 4:
        raise ValueError(
            'Expected at least 4 columns (session, user, item, timestamp) in {}, got {}'.format(
                path, data.shape[1]))

    # The first four columns are renamed positionally, whatever the file calls them.
    col_names = ['session_id', 'user_id', 'item_id', 'ts'] + data.columns.values.tolist()[4:]
    data.columns = col_names

    # An empty frame has no newest timestamp to count back from, so skip filtering.
    if last_months > 0 and len(data) > 0:
        def add_months(sourcedate, months):
            """Shift a date by a (possibly negative) number of calendar months."""
            month_index = sourcedate.month - 1 + months
            # Floor division keeps the year exact for negative offsets as well.
            year = sourcedate.year + month_index // 12
            month = month_index % 12 + 1
            # Clamp the day for shorter target months (e.g. 31 March -> 28 February).
            day = min(sourcedate.day, calendar.monthrange(year, month)[1])
            return datetime.date(year, month, day)

        # `ts` is interpreted as a POSIX timestamp in seconds.
        lastdate = datetime.datetime.fromtimestamp(data.ts.max())
        firstdate = add_months(lastdate, -last_months)
        # `firstdate` is a plain date, so the cut-off is local midnight of that day.
        initial_unix = time.mktime(firstdate.timetuple())

        # filter out older interactions
        data = data[data['ts'] >= initial_unix]

    return data

In [3]:
def create_seq_db_filter_top_k(path, topk=0, last_months=0):
    """
    Build a per-session sequence table from an interaction log.

    :param path: path of the interaction log, passed to ``load_and_adapt``.
    :param topk: if > 0, keep only interactions with the ``topk`` most frequent
        items before building the sequences; 0 (default) keeps every item.
    :param last_months: if > 0, keep only the last ``last_months`` months of data
        (passed to ``load_and_adapt``).
    :return: DataFrame with one row per session and the columns ``session_id``,
        ``sequence`` (list of int item ids in the row order of the input),
        ``ts`` (earliest timestamp of the session) and ``user_id``.
    """
    interactions = load_and_adapt(path, last_months=last_months)

    if topk > 0:
        # Restrict the log to the most popular items, as the function name promises.
        item_counts = Counter(interactions['item_id'])
        popular_items = {item for item, _count in item_counts.most_common(topk)}
        interactions = interactions[interactions['item_id'].isin(popular_items)]

    # group by session id and collect the item ids of each session
    groups = interactions.groupby('session_id')

    # convert item ids to int, then aggregate them to lists
    aggregated = groups['item_id'].aggregate(lambda items: list(map(int, items))).to_frame()
    init_ts = groups['ts'].min()
    users = groups['user_id'].min()  # it's just fast, min doesn't actually make sense

    # Chained, non-mutating calls instead of inplace=True.
    result = (
        aggregated
        .join(init_ts)
        .join(users)
        .reset_index()
        .rename(columns={'item_id': 'sequence'})
    )
    return result

In [4]:
# Build the per-session sequence table from the Last.fm interaction log
# (defaults: topk=0, last_months=0).
df_songs = create_seq_db_filter_top_k(path='6mDataset_lastFM_v9.csv') 

In [5]:
# Preview the first sessions: one row per session with its item sequence.
df_songs.head()

Unnamed: 0,session_id,sequence,ts,user_id
0,927,"[3313114, 3313106, 1847374, 3313138, 3312808]",1390344124,41837
1,933,"[2721045, 205473, 2224399, 2004872, 2061887, 2...",1390452313,41837
2,941,"[1274671, 2300890, 2301013, 2301184, 2301068, ...",1390525074,41837
3,945,"[2300890, 2300955, 2301184, 2300806, 2300818]",1390592062,41837
4,947,"[3003812, 122561, 4398503, 4398498, 4398495, 3...",1390599651,41837


Посчитаем статистики

In [6]:
from collections import Counter

# Occurrence count of every item id across all session sequences.
cnt = Counter(item for session_items in df_songs.sequence for item in session_items)

In [7]:
def _four_number_summary(values):
    """Return (mean, median, min, max) of a numeric array or Series."""
    return values.mean(), np.quantile(values, 0.5), values.min(), values.max()


# Length of every session and number of sessions per user.
sequence_length = df_songs['sequence'].map(len).values
n_sessions_per_user = df_songs.groupby('user_id').size()

print('Number of items: {}'.format(len(cnt)))
print('Number of users: {}'.format(df_songs['user_id'].nunique()))
print('Number of sessions: {}'.format(len(df_songs)))

print('\nSession length:\n\tAverage: {:.2f}\n\tMedian: {}\n\tMin: {}\n\tMax: {}'.format(
    *_four_number_summary(sequence_length)))

print('Sessions per user:\n\tAverage: {:.2f}\n\tMedian: {}\n\tMin: {}\n\tMax: {}'.format(
    *_four_number_summary(n_sessions_per_user)))

Number of items: 185914
Number of users: 1098
Number of sessions: 176998

Session length:
	Average: 9.41
	Median: 9.0
	Min: 5
	Max: 19
Sessions per user:
	Average: 161.20
	Median: 108.5
	Min: 1
	Max: 1344


In [8]:
# Five most frequent item ids, each paired with its occurrence count.
print('Most popular items: {}'.format(cnt.most_common(5)))

Most popular items: [(232971, 651), (2536586, 621), (233271, 500), (1534850, 471), (2185901, 465)]


Проверим названия треков, на первый взгляд похоже на хиты 2015

In [35]:
# Track metadata; the cells below read its `track_id` and `track` columns.
track_check = pd.read_csv("LastFM_track_df_w_artist.csv")

In [36]:
# Print the title of each of the five most frequent items. The ids are taken
# from `cnt` rather than copied by hand, so the cell stays in sync with the data.
top_track_ids = [item_id for item_id, _count in cnt.most_common(5)]
for track_id in top_track_ids:
    print(track_check[track_check.track_id == track_id].track.values[0])

Arctic+Monkeys/_/Do+I+Wanna+Know%3F
Sia/_/Chandelier
Arctic+Monkeys/_/R+U+Mine%3F
Lana+Del+Rey/_/West+Coast
Pharrell+Williams/_/Happy


### Разделим данные на трейн и тест

Делить будем по сессиям, т.е. последние 35 сессий каждого пользователя пойдут в тест, а остальные — в трейн.

In [11]:
def clean_split(train, test):
    """
    Drop from the test sequences every item that never occurs in training.

    The ``sequence`` column of ``test`` is overwritten in place; ``train`` is
    only read.

    :param train: The training set (needs a ``sequence`` column of item lists).
    :param test: The test set (needs a ``sequence`` column of item lists).
    :return: The training set and the cleaned test set (same objects as passed in).
    """
    known_items = set()
    for session_items in train['sequence']:
        known_items.update(session_items)

    def keep_known(session_items):
        return [item for item in session_items if item in known_items]

    test['sequence'] = test['sequence'].map(keep_known)
    return train, test


def last_session_out_split(data,
                           user_key='user_id',
                           session_key='session_id',
                           time_key='ts',
                           n_last_sessions=35):
    """
    Assign the last ``n_last_sessions`` sessions of every user to the test set
    and the remaining ones to the training set.

    Users with at most ``n_last_sessions`` sessions end up entirely in the test
    set. Items that never occur in the training set are then removed from the
    test sequences by ``clean_split``, which works on the ``sequence`` column.

    :param data: session table, one row per session.
    :param user_key: name of the user id column.
    :param session_key: name of the session id column.
    :param time_key: name of the timestamp column used to order sessions.
    :param n_last_sessions: number of most recent sessions per user to hold out
        (default 35, the value previously hard-coded).
    :return: ``(train, test)`` as independent copies.
    :raises ValueError: if ``n_last_sessions`` is smaller than 1.
    """
    if n_last_sessions < 1:
        raise ValueError('n_last_sessions must be >= 1, got {}'.format(n_last_sessions))

    ordered = data.sort_values(by=[user_key, time_key])
    held_out_ids = ordered.groupby(user_key)[session_key].tail(n_last_sessions)

    # Select by the configured session column instead of a hard-coded `session_id`.
    in_test = data[session_key].isin(held_out_ids.values)
    train = data[~in_test].copy()
    test = data[in_test].copy()
    train, test = clean_split(train, test)
    return train, test

In [12]:
# Per-user temporal split: each user's most recent sessions form the test set.
train_data, test_data = last_session_out_split(df_songs)

In [13]:
# (rows, columns) of the two splits.
train_data.shape, test_data.shape

((140227, 4), (36771, 4))

In [14]:
# df_songs.ts = pd.to_datetime(df_songs.ts,  unit='s')
# df_songs.sort_values(by="ts",  inplace = True)

In [15]:
# train_data, test_data = df_songs[:125000],  df_songs[125000:]

In [16]:
# Preview of the training sessions.
train_data.head()

Unnamed: 0,session_id,sequence,ts,user_id
0,927,"[3313114, 3313106, 1847374, 3313138, 3312808]",1390344124,41837
1,933,"[2721045, 205473, 2224399, 2004872, 2061887, 2...",1390452313,41837
2,941,"[1274671, 2300890, 2301013, 2301184, 2301068, ...",1390525074,41837
3,945,"[2300890, 2300955, 2301184, 2300806, 2300818]",1390592062,41837
4,947,"[3003812, 122561, 4398503, 4398498, 4398495, 3...",1390599651,41837


In [17]:
# Preview of the test sessions.
test_data.head()

Unnamed: 0,session_id,sequence,ts,user_id
71,1305,"[1049133, 3292800, 548460, 1659836, 2660982, 1...",1404053242,41837
72,1311,"[3755400, 2077729, 3003731, 642325, 3618497, 3...",1404656240,41837
73,1314,"[2212473, 2495852, 60916, 3420925, 1660062, 21...",1404825707,41837
74,1320,"[3283748, 3283755, 3283688, 3283537, 3283601, ...",1405031715,41837
75,1322,"[3283727, 3283714, 3283673, 3283532, 3283744, ...",1405039511,41837


Проверим размеры последовательностей для теста

In [18]:
# Number of test sessions left with fewer than 5 items.
counter = sum(1 for session_items in test_data.sequence if len(session_items) < 5)

In [19]:
# Show how many test sessions are too short.
counter

3409

Часть сессий в тесте содержит последовательности короче 5 элементов, от них необходимо избавиться.

In [20]:
# Minimum number of items a test session must contain to be kept.
min_k_seq = 5


def del_empty_sequence(full_sequence):
    """
    Flag whether a sequence is long enough to be kept.

    :param full_sequence: list of item ids of one session.
    :return: 1 if the sequence has at least ``min_k_seq`` items, otherwise 0.
    """
    # The earlier version had a no-op `last_k == 0` comparison in the short branch.
    return 1 if len(full_sequence) >= min_k_seq else 0

In [21]:
# Flag every test session: 1 if it has at least `min_k_seq` items, 0 otherwise.
test_data['empty_seq_flag'] = test_data['sequence'].apply(del_empty_sequence)

In [22]:
# Keep only the sessions flagged as long enough. `.copy()` makes the result an
# independent frame, so later column assignments do not raise SettingWithCopyWarning.
test_data = test_data[test_data.empty_seq_flag == 1].copy()

Для теста будем использовать последние 5 треков

In [23]:
# Number of trailing items of a session that are kept for testing.
last_k_seq = 5


def crop_sequence(full_sequence):
    """
    Keep only the last ``last_k_seq`` items of a sequence.

    :param full_sequence: list of item ids of one session.
    :return: a new list with the last ``last_k_seq`` items when the sequence is
        longer than that; otherwise the very same sequence object, unchanged.
    """
    if len(full_sequence) <= last_k_seq:
        return full_sequence
    return full_sequence[-last_k_seq:]

In [24]:
# Truncate every test session to its last `last_k_seq` items.
test_data['sequence'] = test_data['sequence'].apply(crop_sequence)

In [25]:
# Shapes after filtering; the test frame also carries the `empty_seq_flag` column.
train_data.shape, test_data.shape

((140227, 4), (33362, 5))

# Метрики оценки качества модели

В качестве метрики - Recall

In [26]:
def recall(ground_truth, prediction):
    """
    Compute the Recall metric: the share of distinct ground-truth items that
    also occur in the prediction.

    Earlier this returned the raw number of predicted items found in the ground
    truth, which is a ratio only for a single-item ground truth with no repeated
    predictions; anything else broke the ``0 <= recall <= 1`` assertion.

    :param ground_truth: the ground truth set or sequence
    :param prediction: the predicted set or sequence
    :return: the value of the metric, a float in [0, 1]; 0.0 if the ground
        truth is empty
    """
    # De-duplicate by equality rather than set(), so unhashable items
    # (for example lists) are supported as well.
    relevant = []
    for item in ground_truth:
        if item not in relevant:
            relevant.append(item)

    if not relevant:
        return 0.0

    hits = sum(1 for item in relevant if item in prediction)
    return hits / float(len(relevant))

def count_a_in_b_unique(a, b):
    """
    Count how many elements of ``a`` are contained in ``b``.

    Membership is tested with ``in``; an element repeated in ``a`` is counted
    once per occurrence.

    :param a: sequence of elements to look up
    :param b: container the elements are looked up in
    :return: number of elements of a in b
    """
    return sum(1 for candidate in a if candidate in b)


def remove_duplicates(l):
    """
    Drop repeated inner sequences from a list of sequences.

    :param l: iterable of iterables with hashable elements
    :return: list of lists, one per distinct inner sequence; the order of the
        result is that of a set and therefore not guaranteed
    """
    distinct = {tuple(inner) for inner in l}
    return [list(inner) for inner in distinct]

# Compact Prediction Tree

Алгоритм предсказания на основе частых последовательностей: https://cpt.readthedocs.io/en/latest/intro.html

In [27]:
# Plain Python lists of item-id sequences, the form passed to the recommender below.
train_ = train_data.sequence.values.tolist()
test_ = test_data.sequence.values.tolist()

In [28]:
# Number of training and test sessions.
len(train_), len(test_)

(140227, 33362)

In [29]:
from cpt.cpt import Cpt

Обучение

In [30]:
# Fit a Compact Prediction Tree on the training sequences.
recommender = Cpt()
recommender.fit(train_)

Оценка алгоритма происходит следующим образом: алгоритм предсказывает следующий трек по поданному началу сессии длиной k, и для этого предсказания рассчитывается метрика Recall. Затем k итерационно увеличивается, а Recall усредняется по всей тестовой выборке и по всем значениям k.

In [31]:
from tqdm import tqdm

# given_k: number of items from the start of a session that are given to the
# recommender as context at the first step (k=1 means only the first item).
given_k = 1

# look_ahead: number of items that are treated as the ground-truth label.
look_ahead = 1

# step: increment applied to the context length on every iteration.
step = 1

# profile_len: length of a test session.
profile_len = 5

# top_n: length of the recommendation list.
top_n = 5


def sliding_evaluation_func(test_data, recommender, given_k=given_k, look_ahead=look_ahead,
                            step=step, profile_len=profile_len,
                            top_n=top_n):
    """
    Evaluate next-item prediction with a growing context window.

    For every ``k`` in ``range(given_k, profile_len, step)`` the first ``k``
    items of each session are passed to ``recommender.predict_k`` and the item
    at position ``k`` is the ground truth. Recall is averaged over all sessions
    for each ``k`` and then over all ``k``.

    :param test_data: list of equally long item sequences.
    :param recommender: object with a ``predict_k(profiles, top_n)`` method that
        returns one prediction list per profile.
    :param given_k: initial context length.
    :param look_ahead: kept for interface compatibility and currently unused;
        the ground truth is always the single item at position ``k``.
    :param step: increment of the context length between iterations.
    :param profile_len: exclusive upper bound of the context length.
    :param top_n: number of items requested per prediction.
    :return: mean of the per-``k`` average recalls.
    :raises ValueError: if ``test_data`` is not a rectangular 2-D collection.
    """
    sessions = np.array(test_data)
    if sessions.ndim != 2:
        # Column slicing below needs a rectangular array; ragged or empty input
        # would otherwise fail with an opaque indexing error.
        raise ValueError('test_data must be a non-empty list of equally long sequences')

    recall_per_k = []
    for k in range(given_k, profile_len, step):
        profiles = sessions[:, :k].tolist()
        targets = sessions[:, k].tolist()
        preds = recommender.predict_k(profiles, top_n)

        recall_total = sum(recall([target], pred) for target, pred in zip(targets, preds))
        recall_per_k.append(recall_total / len(preds))

    return np.mean(recall_per_k)

In [32]:
# Average recall of the fitted recommender on the test sessions, using the
# default evaluation settings defined above.
recall_at_5_CPT = sliding_evaluation_func(test_, recommender)

In [33]:
# Report the averaged recall.
print('Recall at 5 predictions for CPT++ algorithm: ',recall_at_5_CPT)

Recall at 5 predictions for CPT++ algorithm:  0.3004241352436904


__Пример работы алгоритма__

In [107]:
# Context of the first test session at the first evaluation step. `given_k` is
# used instead of the loop variable `k`, which does not exist until a loop over
# `k` has been executed, so the cell failed on a fresh kernel.
profile = test_[0][:given_k]
profile

[548460]

In [34]:
# Walk through one test session (index 36): at every step the first k items are
# the profile, the item at position k is the ground truth, and the recommender
# returns its top_n candidates.
# NOTE: k, profile, gt and preds are module-level names and remain defined after the loop.
for k in range(given_k, profile_len, step):
    profile = (test_[36][:k])
    gt = (test_[36][k])
    preds = recommender.predict_k([profile], top_n)
    print('profile: ', profile)
    print('gt: ', gt)
    print('predictions: ', preds)

profile:  [3440873]
gt:  3440875
predictions:  [[3440894, 3440886, 3440875, 3440902, 3440900]]
profile:  [3440873, 3440875]
gt:  3440902
predictions:  [[3440894, 3440886, 3440902, 3440900, 3440874]]
profile:  [3440873, 3440875, 3440902]
gt:  3440890
predictions:  [[3440894, 3440886, 3440900, 3440890, 3440874]]
profile:  [3440873, 3440875, 3440902, 3440890]
gt:  3440900
predictions:  [[3440894, 3440886, 3440874, 3440877, 3440900]]


In [38]:
# Predictions of the first step (k=1) of the example above, copied by hand.
ids = [3440894, 3440886, 3440875, 3440902, 3440900]

In [45]:
# Resolve the ids of step k=1 to track names.
# NOTE: `isin` returns rows in the order they appear in `track_check`, so the
# printed predictions are not in the recommender's ranking order.
print('profile: ', track_check[track_check.track_id.isin([3440873])].track.values[0])
print('gt: ', track_check[track_check.track_id.isin([3440875])].track.values[0])
print('predictions: ', track_check[track_check.track_id.isin(ids)].track.values)

profile:  Heart+of+a+Coward/_/All+Eyes+To+The+Sky
gt:  Heart+of+a+Coward/_/Around+A+Girl+(In+80+Days)
predictions:  ['Heart+of+a+Coward/_/We+Stand+As+One'
 'Heart+of+a+Coward/_/Hope+and+hindrance' 'Heart+of+a+Coward/_/Nightmare'
 'Heart+of+a+Coward/_/Shade'
 'Heart+of+a+Coward/_/Around+A+Girl+(In+80+Days)']


In [47]:
# Profile and predictions of the second step (k=2) of the example above, copied by hand.
prof_1 = [3440873, 3440875]
preds_1 = [3440894, 3440886, 3440902, 3440900, 3440874]

In [48]:
# Resolve the ids of step k=2 to track names.
# NOTE: `isin` returns rows in the order they appear in `track_check`, so the
# printed names do not follow the order of `prof_1` / `preds_1`.
print('profile: ', track_check[track_check.track_id.isin(prof_1)].track.values)
print('gt: ', track_check[track_check.track_id.isin([3440902])].track.values[0])
print('predictions: ', track_check[track_check.track_id.isin(preds_1)].track.values)

profile:  ['Heart+of+a+Coward/_/All+Eyes+To+The+Sky'
 'Heart+of+a+Coward/_/Around+A+Girl+(In+80+Days)']
gt:  Heart+of+a+Coward/_/We+Stand+As+One
predictions:  ['Heart+of+a+Coward/_/We+Stand+As+One'
 'Heart+of+a+Coward/_/And+Only+Time+Will+Tell'
 'Heart+of+a+Coward/_/Hope+and+hindrance' 'Heart+of+a+Coward/_/Nightmare'
 'Heart+of+a+Coward/_/Shade']
