### ML
Суть решения в том, чтобы собрать статистики по пользователю и по предмету и к каждой комбинации этих статистик предсказывать 0 или 1  
Использовался CatBoost - градиентный бустинг над решающими деревьями  
+Также добавлялись сюда рекомендации от TIFU KNN, это дало хороший прирост +0.08 к метрике, в итоге результат около 0.40, как в случае и с TIFU KNN

! Можно запускать в режиме "Run all"

In [1]:
%%time
%pylab inline
import pandas as pd
import numpy as np
from catboost import CatBoostClassifier
from sklearn.model_selection import train_test_split
from tqdm import tqdm
tqdm.pandas()

Populating the interactive namespace from numpy and matplotlib
Wall time: 2.08 s


### Unpack the data

In [2]:
df = pd.read_csv("data/main.csv")

df.rename(columns={"order_completed_at":"time"}, inplace=True) # rename "order_completed_at" column to "time"
df["time"] = pd.to_datetime(df["time"], format="%Y-%m-%d %H:%M:%S") # "time" column to datetime type

In [3]:
def duplicates_to_count(t):
    return t.groupby(['user_id', 'time'])['cart'].value_counts() \
                                                  .to_frame() \
                                                  .rename(columns={"cart":"count"}) \
                                                  .reset_index()

def count_to_duplicates(t):
    g = t.copy()
    g["to_explode"] = g["count"].apply(lambda x: [i for i in range(x)])
    g = g.explode("to_explode") \
         .drop(columns=["count", "to_explode"])
    return g

def make_train_targets(t, level=1):
    user_last_time = t.groupby(["user_id"])["time"].max().to_frame().reset_index()
    user_last_time["last_buy"] = 1
    
    train = pd.merge(t, user_last_time, on=["time", "user_id"], how="left")
    train = train[train["last_buy"] != 1]
    train.drop(columns=["last_buy"], inplace=True)
    
    if level >= 2:
        return make_train_targets(train, level-1)
    
    user_last_time.drop(columns=["last_buy"], inplace=True)
    
    user_last_carts = pd.merge(user_last_time, t, on=["user_id", "time"], how="left")
    
    skeleton = make_skeleton(t)
    targets = pd.merge(skeleton, user_last_carts.drop(columns=["time"]), on=["user_id","cart"], how="left")
    targets.fillna(0, inplace=True)
    targets["count"] = targets["count"].progress_apply(lambda x: x if x <= 1 else 1).astype(int)
    targets.rename(columns={"count":"target"}, inplace=True)
    return train, targets

def make_skeleton(t):
    return t.groupby("user_id")["cart"].unique().to_frame().reset_index().explode("cart")


In [4]:
# Топ категорий пользователя
def user_top_k_carts(t, k):
    g = count_to_duplicates(t).groupby("user_id")["cart"].value_counts().to_frame().rename(columns={"cart":"count"})\
        .groupby("user_id")["count"].head(k).to_frame().reset_index().drop(columns=["count"])\
        .groupby("user_id")["cart"].agg([lambda x: x.tolist()]).rename(columns={"<lambda>":"user_top_"+str(k)+"_carts"}).reset_index()
    for i in tqdm(range(k)):
        g["user_top_"+str(i+1)+"_cart"] = g["user_top_"+str(k)+"_carts"].apply(lambda x: x[i] if len(x) > i else -1)
    return g.drop(columns=["user_top_"+str(k)+"_carts"]) 

# Кол-во заказов пользователя
def user_orders_count(t):
    return t.groupby(["user_id"])["time"].nunique().to_frame().reset_index().rename(columns={"time":"orders_count"})

# Кол-во вещей пользователя по всем заказам
def user_items_count(t):
    return t.groupby("user_id")["count"].sum().to_frame().reset_index().rename(columns={"count":"items_count"})

# Таблица как в TIFU KNN, сильно грузит память (около 30GB RAM), но прироста большого не дает
def user_pivot_table(t):
    g = pd.pivot_table(t, columns="cart", index="user_id", values="count", aggfunc=np.sum, fill_value=0)
    for cart in list(set(df["cart"].unique()).difference(set(t["cart"].unique()))):
        g[cart] = 0
    return g

# Количество уникальных категорий, в которых покупал пользователь
def user_carts_count(t):
    return t.groupby("user_id")["cart"].nunique().to_frame().reset_index().rename(columns={"cart":"carts_count"})

# Кол-во заказов каждой категории
def cart_orders_count(t):
    return t.groupby("cart")["count"].agg(["sum"]).reset_index().rename(columns={"sum":"cart_orders_count"})

# Кол-во пользователей, которые заказали товар из категории
def cart_unique_users_count(t):
    return t.groupby("cart")["user_id"].nunique().to_frame().reset_index().rename(columns={"user_id":"cart_unique_users_count"})

# Среднее, мин., макс. и медиана количества товаров каждой категории в корзине (бесполезная)
def cart_mean_count(t):
    return t.groupby("cart")["count"].agg([("cart_mean_count","mean"),
                                           ("cart_min_count", "min"),
                                           ("cart_max_count", "max"),
                                           ("cart_median_count", "median")]).reset_index()

# Среднее, мин., макс. и медиана размера корзины пользователя за все заказы
def user_mean_cart_in_order_count(t):
    return t.groupby(["user_id","time"])["count"].sum().to_frame() \
                                                 .groupby(["user_id"]) \
                                                 .agg(user_mean_cart_in_order_count = ('count', 'mean'),
                                                      user_min_cart_in_order_count = ('count', 'min'),
                                                      user_max_cart_in_order_count = ('count', 'max'),
                                                      user_median_cart_in_order_count = ('count', 'median')).reset_index()
    
def make(df, skeleton):
    main = skeleton.copy()
    
    main = pd.merge(main, user_orders_count(df), on="user_id", how="left")
    main.fillna(0, inplace=True)
    main["orders_count"] = main["orders_count"].astype(int)
    
    main = pd.merge(main, user_items_count(df), on="user_id", how="left")
    main.fillna(0, inplace=True)
    main["items_count"] = main["items_count"].astype(int)
    
    main = pd.merge(main, user_carts_count(df), on="user_id", how="left")
    main.fillna(0, inplace=True)
    main["carts_count"] = main["carts_count"].astype(int)
    
    main = pd.merge(main, cart_orders_count(df), on="cart", how="left")
    main.fillna(0, inplace=True)
    main["cart_orders_count"] = main["cart_orders_count"].astype(int)
    
    main = pd.merge(main, cart_unique_users_count(df), on="cart", how="left")
    main.fillna(0, inplace=True)
    main["cart_unique_users_count"] = main["cart_unique_users_count"].astype(int)
    
    main = pd.merge(main, user_mean_cart_in_order_count(df), on="user_id", how="left")
    main.fillna(0, inplace=True)
    
    main = pd.merge(main, cart_mean_count(df), on="cart", how="left")
    main.fillna(0, inplace=True)
    
    top_k = 5
    main = pd.merge(main, user_top_k_carts(df, top_k), on="user_id", how="left")
    main.fillna(-1, inplace=True)
    for i in range(top_k):
        main["user_top_"+str(i+1)+"_cart"] = main["user_top_"+str(i+1)+"_cart"].astype(int)
    

    return main

In [5]:
df = duplicates_to_count(df)

In [6]:
train_1, targets_1 = make_train_targets(df, level=1)

main_1 = make(train_1, targets_1)

main_1.head()

100%|████████████████████████████████████████████████████████████████████| 1117600/1117600 [00:01<00:00, 870506.87it/s]
100%|███████████████████████████████████████████████████████████████████████████████████| 5/5 [00:00<00:00, 108.76it/s]


Unnamed: 0,user_id,cart,target,orders_count,items_count,carts_count,cart_orders_count,cart_unique_users_count,user_mean_cart_in_order_count,user_min_cart_in_order_count,...,user_median_cart_in_order_count,cart_mean_count,cart_min_count,cart_max_count,cart_median_count,user_top_1_cart,user_top_2_cart,user_top_3_cart,user_top_4_cart,user_top_5_cart
0,0,14,0,2,33,27,85164,15273,16.5,8,...,16.5,1.0,1.0,1.0,1.0,14,57,82,379,405
1,0,20,0,2,33,27,13664,5906,16.5,8,...,16.5,1.0,1.0,1.0,1.0,14,57,82,379,405
2,0,57,1,2,33,27,98788,16336,16.5,8,...,16.5,1.0,1.0,1.0,1.0,14,57,82,379,405
3,0,82,0,2,33,27,24980,7493,16.5,8,...,16.5,1.0,1.0,1.0,1.0,14,57,82,379,405
4,0,379,0,2,33,27,19764,8470,16.5,8,...,16.5,1.0,1.0,1.0,1.0,14,57,82,379,405


In [7]:
recs_1 = pd.read_csv("data/train_lvl_1_recs.csv")

In [8]:
recs_1.head()

Unnamed: 0,user_id,items,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17
0,0,"[57, 14, 84, 22, 82, 383, 409, 432, 379, 382, ...",57,14,84,22,82,383,409,432,379,382,61,5,430,23,41,398,441,402
1,1,"[55, 798, 169, 812, 171, 14, 170, 88, 198, 441...",55,798,169,812,171,14,170,88,198,441,104,806,57,61,172,808,304,63
2,2,"[57, 61, 23, 382, 82, 84, 403, 398, 22, 17, 38...",57,61,23,382,82,84,403,398,22,17,383,14,15,430,19,420,431,409
3,3,"[57, 61, 84, 398, 430, 382, 19, 41, 383, 22, 1...",57,61,84,398,430,382,19,41,383,22,14,16,42,43,402,23,15,17
4,4,"[57, 398, 61, 84, 22, 420, 17, 14, 430, 388, 1...",57,398,61,84,22,420,17,14,430,388,16,712,100,29,54,23,425,169


In [9]:
main_1 = pd.merge(main_1, recs_1, on="user_id", how="left")

In [10]:
main_array = main_1.to_numpy()
arr_has = []
arr_pos = []
index_of_items = main_1.columns.tolist().index("items")
for row in tqdm(main_array):
    v = row[index_of_items].replace(']','').replace('[','')
    v = v.split(", ")
    for i in range(len(v)):
        v[i] = int(v[i])
    if row[1] in v:
        arr_has.append(1)
        arr_pos.append(v.index(row[1]))
    else:
        arr_has.append(0)
        arr_pos.append(-1)
        
main_1["has_cart_in_recs"] = arr_has
main_1["pos_cart_in_recs"] = arr_pos

100%|████████████████████████████████████████████████████████████████████| 1117600/1117600 [00:10<00:00, 102568.74it/s]


In [11]:
main_1.drop(columns=["items"], inplace=True)

Далее можно расширить обучающую выборку, добавив к ней датасет в таком же формате, но собраный при level=2, например

In [None]:
# train_2, targets_2 = make_train_targets(df, level=2)

# main_2 = make(train_2, targets_2)

# recs_2 = pd.read_csv("data/train_lvl_2_recs.csv")

# main_2 = pd.merge(main_2, recs_2, on="user_id", how="left")

# main_array = main_2.to_numpy()
# arr_has = []
# arr_pos = []
# index_of_items = main_2.columns.tolist().index("items")
# for row in tqdm(main_array):
#     v = row[index_of_items].replace(']','').replace('[','')
#     v = v.split(", ")
#     for i in range(len(v)):
#         v[i] = int(v[i])
#     if row[1] in v:
#         arr_has.append(1)
#         arr_pos.append(v.index(row[1]))
#     else:
#         arr_has.append(0)
#         arr_pos.append(-1)
        
# main_2["has_cart_in_recs"] = arr_has
# main_2["pos_cart_in_recs"] = arr_pos

# main_2.drop(columns=["items"], inplace=True)

In [12]:
main_final = main_1.copy() # or pd.concat([main_1, main_2])

In [13]:
main_final.shape

(1117600, 41)

In [14]:
main_final.head()

Unnamed: 0,user_id,cart,target,orders_count,items_count,carts_count,cart_orders_count,cart_unique_users_count,user_mean_cart_in_order_count,user_min_cart_in_order_count,...,10,11,12,13,14,15,16,17,has_cart_in_recs,pos_cart_in_recs
0,0,14,0,2,33,27,85164,15273,16.5,8,...,61,5,430,23,41,398,441,402,1,1
1,0,20,0,2,33,27,13664,5906,16.5,8,...,61,5,430,23,41,398,441,402,0,-1
2,0,57,1,2,33,27,98788,16336,16.5,8,...,61,5,430,23,41,398,441,402,1,0
3,0,82,0,2,33,27,24980,7493,16.5,8,...,61,5,430,23,41,398,441,402,1,4
4,0,379,0,2,33,27,19764,8470,16.5,8,...,61,5,430,23,41,398,441,402,1,8


In [15]:
top_k = 5
cat_features = ['cart'] + ["has_cart_in_recs"] + ["user_top_"+str(i+1)+"_cart" for i in range(top_k)] + [str(i) for i in range(15)]
cat_features

['cart',
 'has_cart_in_recs',
 'user_top_1_cart',
 'user_top_2_cart',
 'user_top_3_cart',
 'user_top_4_cart',
 'user_top_5_cart',
 '0',
 '1',
 '2',
 '3',
 '4',
 '5',
 '6',
 '7',
 '8',
 '9',
 '10',
 '11',
 '12',
 '13',
 '14']

In [16]:
x_train, x_validation, y_train, y_validation = train_test_split(main_final.drop(columns=["user_id", "target"]), 
                                                                main_final["target"], 
                                                                stratify=main_final["target"],
                                                                test_size=0.33, 
#                                                                 random_state=42
                                                               )

In [17]:
model = CatBoostClassifier(iterations=300,
#                             depth = 6,
                            learning_rate = 0.35,
#                             l2_leaf_reg = 4,
                            eval_metric="F1",
                            loss_function = "Logloss",
                            task_type="GPU",
                            # fold_permutation_block = 2,
                            # fold_len_multiplier = 1.5,
                            # leaf_estimation_iterations = 10,
                            # max_ctr_complexity = 1,
                            random_seed= 127,
                            cat_features = cat_features
                           )

In [18]:
model.fit(x_train, 
          y_train, 
          eval_set=(x_validation, y_validation), 
          use_best_model=True, 
          early_stopping_rounds=50,  
#           plot=True, 
          verbose=10
          )

0:	learn: 0.1099941	test: 0.1126340	best: 0.1126340 (0)	total: 1.21s	remaining: 6m 2s
10:	learn: 0.2315188	test: 0.2350459	best: 0.2350459 (10)	total: 13.4s	remaining: 5m 52s
20:	learn: 0.3014821	test: 0.3070614	best: 0.3070614 (20)	total: 27s	remaining: 5m 58s
30:	learn: 0.3299170	test: 0.3361981	best: 0.3361981 (30)	total: 40.8s	remaining: 5m 53s
40:	learn: 0.3392211	test: 0.3439120	best: 0.3439120 (40)	total: 54.6s	remaining: 5m 45s
50:	learn: 0.3450597	test: 0.3478453	best: 0.3478769 (49)	total: 1m 7s	remaining: 5m 30s
60:	learn: 0.3530195	test: 0.3534876	best: 0.3537857 (57)	total: 1m 21s	remaining: 5m 20s
70:	learn: 0.3582180	test: 0.3601338	best: 0.3601338 (70)	total: 1m 35s	remaining: 5m 8s
80:	learn: 0.3639893	test: 0.3651207	best: 0.3651207 (80)	total: 1m 48s	remaining: 4m 54s
90:	learn: 0.3645989	test: 0.3658567	best: 0.3658567 (90)	total: 2m 2s	remaining: 4m 40s
100:	learn: 0.3666789	test: 0.3663696	best: 0.3664952 (98)	total: 2m 15s	remaining: 4m 27s
110:	learn: 0.3700925	

<catboost.core.CatBoostClassifier at 0x2dc801a20d0>

In [19]:
# Score: 0.37822 (level=1), 0.47373 (level=2), 0.58814 (level=3) - mean=0.48086

In [20]:
model.get_feature_importance(prettified=True)

Unnamed: 0,Feature Id,Importances
0,cart,24.683438
1,orders_count,21.69053
2,pos_cart_in_recs,10.375611
3,carts_count,7.328423
4,has_cart_in_recs,5.787892
5,user_top_5_cart,5.373141
6,items_count,5.362698
7,user_top_4_cart,2.585103
8,user_median_cart_in_order_count,2.488839
9,user_top_3_cart,2.291925


In [None]:
useless_features = model.get_feature_importance(prettified=True).query("Importances==0")["Feature Id"].tolist()

In [None]:
useless_features

##### Что еще можно попробовать в дальнейшем (*TODO*)

1. Cделать кластеризацию покупателей и делать модельки для каждого кластера.
2. Кластеризовать товары, используя в качестве фичей эмбеддинги, построенные по корзинам покупателей. Это позволит делать рекомендации товаров, которые ранее не покупались данным клиентом.
3. Поработать над фичами, которые отражают взаимодействие конкретного пользователя и конкретного товара.
4. Попробовать использовать датасет при level=2 в качестве обучающей выборки, а датасет при level=1 - в качестве тестовой. (Но необходимо будет решить проблему с категориальными фичами, т.к. catboost не будет давать предикт, если появятся новые значения у категориальной фичи.)