In [None]:
# импортируем необходимые библиотеки
import pandas as pd
from scipy.sparse import csr_matrix
from implicit.cpu.als import AlternatingLeastSquares
from implicit.evaluation import mean_average_precision_at_k

In [2]:
df = pd.read_csv('interactions.csv', parse_dates=['last_watch_dt'])

In [3]:
df

Unnamed: 0,user_id,item_id,last_watch_dt,total_dur,watched_pct
0,176549,9506,2021-05-11,4250,72.0
1,699317,1659,2021-05-29,8317,100.0
2,656683,7107,2021-05-09,10,0.0
3,864613,7638,2021-07-05,14483,100.0
4,964868,9506,2021-04-30,6725,100.0
...,...,...,...,...,...
5476246,648596,12225,2021-08-13,76,0.0
5476247,546862,9673,2021-04-13,2308,49.0
5476248,697262,15297,2021-08-20,18307,63.0
5476249,384202,16197,2021-04-19,6203,100.0


In [4]:
df.info(show_counts=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5476251 entries, 0 to 5476250
Data columns (total 5 columns):
 #   Column         Non-Null Count    Dtype         
---  ------         --------------    -----         
 0   user_id        5476251 non-null  int64         
 1   item_id        5476251 non-null  int64         
 2   last_watch_dt  5476251 non-null  datetime64[ns]
 3   total_dur      5476251 non-null  int64         
 4   watched_pct    5475423 non-null  float64       
dtypes: datetime64[ns](1), float64(1), int64(3)
memory usage: 208.9 MB


In [5]:
df['watched_pct'] = df['watched_pct'].fillna(0) #заполним пропущенные значения нулями

In [6]:
df['last_watch_dt'].min(), df['last_watch_dt'].max()

(Timestamp('2021-03-13 00:00:00'), Timestamp('2021-08-22 00:00:00'))

In [7]:
#разделим данные на обучающую и тестовую выборку - для теста возьмем последние 7 дней
test = df[df['last_watch_dt'] > '2021-08-15']
train = df[df['last_watch_dt'] <= '2021-08-15']
train

Unnamed: 0,user_id,item_id,last_watch_dt,total_dur,watched_pct
0,176549,9506,2021-05-11,4250,72.0
1,699317,1659,2021-05-29,8317,100.0
2,656683,7107,2021-05-09,10,0.0
3,864613,7638,2021-07-05,14483,100.0
4,964868,9506,2021-04-30,6725,100.0
...,...,...,...,...,...
5476245,786732,4880,2021-05-12,753,0.0
5476246,648596,12225,2021-08-13,76,0.0
5476247,546862,9673,2021-04-13,2308,49.0
5476249,384202,16197,2021-04-19,6203,100.0


In [11]:
train.nunique()

user_id          906071
item_id           15577
last_watch_dt       156
total_dur        126663
watched_pct         101
dtype: int64

In [12]:
test.nunique()

user_id          167348
item_id            7106
last_watch_dt         7
total_dur         38328
watched_pct         101
dtype: int64

In [9]:
#предположим, что процент времени просмотра линейно связан с вероятностью того, что контент понравился

train_df = train[['user_id', 'item_id', 'watched_pct']].copy()
test_df = test[['user_id', 'item_id', 'watched_pct']].copy()

train_df['watched_pct'] = train_df['watched_pct'] / 100
test_df['watched_pct'] = test_df['watched_pct'] / 100

#создадим разреженные матрицы взаимодействий users-items для обучающей и тестовой выборок

userid = list(train_df['user_id'].unique())
itemid = list(train_df['item_id'].unique())
data = train_df['watched_pct'].tolist()
row = pd.Categorical(train_df['user_id'], categories=userid, ordered=True).codes
col = pd.Categorical(train_df['item_id'], categories=itemid, ordered=True).codes
train_matrix = csr_matrix((data, (row, col)), shape=(len(userid), len(itemid)))

userid = list(test_df['user_id'].unique())
itemid = list(test_df['item_id'].unique())
data = test_df['watched_pct'].tolist()
row = pd.Categorical(test_df['user_id'], categories=userid, ordered=True).codes
col = pd.Categorical(test_df['item_id'], categories=itemid, ordered=True).codes
test_matrix = csr_matrix((data, (row, col)), shape=(len(userid), len(itemid)))

In [10]:
train_matrix, test_matrix

(<906071x15577 sparse matrix of type '<class 'numpy.float64'>'
 	with 5051815 stored elements in Compressed Sparse Row format>,
 <167348x7106 sparse matrix of type '<class 'numpy.float64'>'
 	with 424436 stored elements in Compressed Sparse Row format>)

In [17]:
#обучим рекомендательный алгоритм

model = AlternatingLeastSquares(factors=64, regularization=0.05, random_state=42)
model.fit(train_matrix)


  0%|          | 0/15 [00:00<?, ?it/s]

In [18]:
#посчитаем метрику map@10 
mean_average_precision_at_k(model, train_matrix, test_matrix, K=10)

  0%|          | 0/167348 [00:00<?, ?it/s]

0.0034842071949627938