## Apriori
При помощи алгоритма Apriori найдём закономерности в покупках из датасета — например, какие товары часто покупают вместе.  
В дальнейшем с помощью найденных правил можно попробовать расширить список рекомендаций для пользователя.  
Здесь можно найти пояснения для всех параметров и результатов использования этого алгоритма:  
https://stackabuse.com/association-rule-mining-via-apriori-algorithm-in-python

! Можно использовать режим "Run all"

In [1]:
%%time
%pylab inline
import pandas as pd
import numpy as np
from tqdm import tqdm
from efficient_apriori import apriori
# Register tqdm's progress_apply on pandas objects (used by the groupby cell below).
tqdm.pandas()

Populating the interactive namespace from numpy and matplotlib
Wall time: 604 ms


In [2]:
# Load the purchase log and expose the order-completion timestamp column as "time".
df = pd.read_csv("data/main.csv").rename(columns={"order_completed_at": "time"})

# Parse the "time" strings into datetime values using an explicit format.
df["time"] = pd.to_datetime(df["time"], format="%Y-%m-%d %H:%M:%S")

In [3]:
# Preview the first five rows of the loaded frame.
df.head(n=5)

Unnamed: 0,user_id,time,cart
0,2,2015-03-22 09:25:46,399
1,2,2015-03-22 09:25:46,14
2,2,2015-03-22 09:25:46,198
3,2,2015-03-22 09:25:46,88
4,2,2015-03-22 09:25:46,157


In [4]:
# Collapse the rows of each (user_id, time) group into a single tuple of
# "cart" values, showing a progress bar while the groups are processed.
t = (
    df.groupby(["user_id", "time"])["cart"]
    .progress_apply(tuple)
    .to_frame()
    .reset_index()
)

100%|███████████████████████████████████████████████████████████████████████| 209406/209406 [00:05<00:00, 39424.89it/s]


In [5]:
# Preview the first five grouped orders.
t.head(n=5)

Unnamed: 0,user_id,time,cart
0,0,2020-07-19 09:59:17,"(20, 82, 441, 57, 14, 405, 430, 379)"
1,0,2020-08-24 08:55:32,"(133, 5, 26, 10, 382, 14, 22, 41, 25, 441, 411..."
2,0,2020-09-02 07:38:25,"(803, 170, 84, 61, 440, 57, 55, 401, 398, 399,..."
3,1,2019-05-08 16:09:41,"(55,)"
4,1,2020-01-17 14:44:23,"(82, 798, 86, 421, 204, 55)"


In [6]:
# Run apriori over the list of cart tuples with the chosen support and
# confidence thresholds; it yields the itemsets and the association rules.
itemsets, rules = apriori(
    list(t["cart"]),
    min_support=0.1,
    min_confidence=0.5,
)

In [7]:
# Split the mined rules into parallel lists, one entry per rule:
# left-hand side, right-hand side, confidence, support and lift.
arr_first = [list(rule.lhs) for rule in rules]
arr_second = [list(rule.rhs) for rule in rules]
arr_conf = [rule.confidence for rule in rules]
arr_supp = [rule.support for rule in rules]
arr_lift = [rule.lift for rule in rules]

# Print every rule in its original order for visual inspection.
for rule in rules:
    print(rule)

{57} -> {14} (conf: 0.516, supp: 0.268, lift: 1.151, conv: 1.140)
{14} -> {57} (conf: 0.598, supp: 0.268, lift: 1.151, conv: 1.195)
{430} -> {14} (conf: 0.513, supp: 0.108, lift: 1.143, conv: 1.131)
{430} -> {57} (conf: 0.826, supp: 0.174, lift: 1.588, conv: 2.753)
{22} -> {14} (conf: 0.587, supp: 0.192, lift: 1.309, conv: 1.335)
{41} -> {14} (conf: 0.518, supp: 0.109, lift: 1.154, conv: 1.143)
{84} -> {14} (conf: 0.538, supp: 0.176, lift: 1.199, conv: 1.193)
{382} -> {14} (conf: 0.543, supp: 0.122, lift: 1.210, conv: 1.206)
{383} -> {14} (conf: 0.545, supp: 0.127, lift: 1.215, conv: 1.212)
{402} -> {14} (conf: 0.550, supp: 0.131, lift: 1.227, conv: 1.226)
{409} -> {14} (conf: 0.606, supp: 0.174, lift: 1.352, conv: 1.401)
{22} -> {57} (conf: 0.672, supp: 0.220, lift: 1.293, conv: 1.464)
{41} -> {57} (conf: 0.557, supp: 0.117, lift: 1.070, conv: 1.083)
{84} -> {57} (conf: 0.671, supp: 0.220, lift: 1.291, conv: 1.459)
{382} -> {57} (conf: 0.618, supp: 0.138, lift: 1.189, conv: 1.257)
{38

In [8]:
# Show the left-hand sides of the first three rules.
arr_first[0:3]

[[57], [14], [430]]

In [9]:
# Assemble the rule table column by column, in the order
# from, to, conf, supp, lift.
ap = pd.DataFrame()
ap["from"], ap["to"] = arr_first, arr_second
ap["conf"], ap["supp"], ap["lift"] = arr_conf, arr_supp, arr_lift

In [10]:
# Keep the 20 rules with the highest support; ties are ordered by confidence.
ap = ap.sort_values(by=["supp", "conf"], ascending=[False, False]).iloc[:20]

In [11]:
# Replace each list in "from"/"to" with its first element.
# NOTE(review): only element 0 is kept, so a rule side holding more than one
# item would lose the rest — confirm that the kept rules are single-item.
ap = ap.assign(
    **{
        "from": ap["from"].apply(lambda items: items[0]),
        "to": ap["to"].apply(lambda items: items[0]),
    }
)

In [12]:
# Preview the first five rows of the final rule table.
ap.head(n=5)

Unnamed: 0,from,to,conf,supp,lift
19,61,57,0.710759,0.310712,1.367021
20,57,61,0.597601,0.310712,1.367021
1,14,57,0.598316,0.268455,1.150757
0,57,14,0.516326,0.268455,1.150757
22,398,57,0.645935,0.251994,1.242343


In [13]:
# Write the rule table to CSV without the index column.
ap.to_csv(path_or_buf="apriori_top_20.csv", index=False)