### ML
Суть решения в том, чтобы собрать статистики по пользователю и по предмету и к каждой комбинации этих статистик предсказывать 0 или 1  
Использовался CatBoost - градиентный бустинг над решающими деревьями  
+Также добавлялись сюда рекомендации от TIFU KNN, это дало хороший прирост +0.08 к метрике, в итоге результат около 0.40, как в случае и с TIFU KNN

! Можно запускать в режиме "Run all"

In [1]:
%%time
%pylab inline
import pandas as pd
import numpy as np
from catboost import CatBoostClassifier
from sklearn.model_selection import train_test_split
from tqdm import tqdm
tqdm.pandas()

Populating the interactive namespace from numpy and matplotlib
Wall time: 3.58 s


### Unpack the data

In [2]:
df = pd.read_csv("data/main.csv")

df.rename(columns={"order_completed_at":"time"}, inplace=True) # rename "order_completed_at" column to "time"
df["time"] = pd.to_datetime(df["time"], format="%Y-%m-%d %H:%M:%S") # "time" column to datetime type

In [3]:
def duplicates_to_count(t):
    map_table = t.groupby("time")["user_id"].first().to_frame().reset_index()
    count_table = t.groupby("time")["cart"].value_counts().to_frame().rename(columns={"cart":"count"}).reset_index()
    return pd.merge(count_table, map_table, on="time", how="left").reset_index()[["user_id", "time", "cart", "count"]]

def count_to_duplicates(t):
    g = t.copy()
    g["to_explode"] = g["count"].apply(lambda x: [i for i in range(x)])
    g.explode("to_explode").reset_index().drop(columns=["count", "to_explode", "index"])
    return g

def make_train_targets(t, level=1):
    user_last_time = t.groupby(["user_id"])["time"].last().to_frame().reset_index()
    user_last_time["last_buy"] = 1
    
    train = pd.merge(t, user_last_time, on=["time", "user_id"], how="left")
    train = train[train["last_buy"] != 1]
    train.drop(columns=["last_buy"], inplace=True)
    
    if level >= 2:
        return make_train_targets(train, level-1)
    
    user_last_time.drop(columns=["last_buy"], inplace=True)
    
    user_last_carts = pd.merge(user_last_time, df.drop(columns=["user_id"]), on="time", how="left")
    
    skeleton = make_skeleton(t)
    targets = pd.merge(skeleton, user_last_carts.drop(columns=["time"]), on=["user_id","cart"], how="left")
    targets.fillna(0, inplace=True)
    targets["count"] = targets["count"].progress_apply(lambda x: x if x <= 1 else 1).astype(int)
    targets.rename(columns={"count":"target"}, inplace=True)
    return train, targets

def make_skeleton(t):
    return t.groupby("user_id")["cart"].unique().to_frame().reset_index().explode("cart")

# Топ категорий пользователя
def user_top_k_carts(t, k):
    g = count_to_duplicates(t).groupby("user_id")["cart"].value_counts().to_frame().rename(columns={"cart":"count"})\
        .groupby("user_id")["count"].head(k).to_frame().reset_index().drop(columns=["count"])\
        .groupby("user_id")["cart"].agg([lambda x: x.tolist()]).rename(columns={"<lambda>":"user_top_"+str(k)+"_carts"}).reset_index()
    for i in tqdm(range(k)):
        g["user_top_"+str(i+1)+"_cart"] = g["user_top_"+str(k)+"_carts"].apply(lambda x: x[i] if len(x) > i else -1)
    return g.drop(columns=["user_top_"+str(k)+"_carts"]) 

# Кол-во заказов пользователя
def user_orders_count(t):
    return t.groupby(["user_id"])["time"].nunique().to_frame().reset_index().rename(columns={"time":"orders_count"})

# Кол-во вещей пользователя по всем заказам
def user_items_count(t):
    return t.groupby("user_id")["count"].sum().to_frame().reset_index().rename(columns={"count":"items_count"})

# Таблица как в TIFU KNN, сильно грузит память (около 30GB RAM), но прироста большого не дает
def user_pivot_table(t):
    g = pd.pivot_table(t, columns="cart", index="user_id", values="count", aggfunc=np.sum, fill_value=0)
    for cart in list(set(df["cart"].unique()).difference(set(t["cart"].unique()))):
        g[cart] = 0
    return g

# Количество уникальных категорий, в которых покупал пользователь
def user_carts_count(t):
    return t.groupby("user_id")["cart"].nunique().to_frame().reset_index().rename(columns={"cart":"carts_count"})

# Кол-во заказов каждой категории
def cart_orders_count(t):
    return t.groupby("cart")["count"].agg(["sum"]).reset_index().rename(columns={"sum":"cart_orders_count"})

# Кол-во пользователей, которые заказали товар из категории
def cart_unique_users_count(t):
    return t.groupby("cart")["user_id"].nunique().to_frame().reset_index().rename(columns={"user_id":"cart_unique_users_count"})

# Среднее, мин., макс. и медиана количества товаров каждой категории в корзинеы
def cart_mean_count(t):
    return t.groupby("cart")["count"].agg([("cart_mean_count","mean"), ("cart_min_count", "min"), ("cart_max_count", "max"), ("cart_median_count", "median")]).reset_index()

# Среднее, мин., макс. и медиана размера корзины пользователя за все заказы
def user_mean_cart_in_order_count(t):
    return t.groupby(["user_id","time"])["count"].sum().to_frame().groupby(["user_id"])["count"].agg(["mean", "min", "max", "median"]).reset_index()\
            .rename(columns={"mean":"user_mean_cart_in_order_count", 
                     "min":"user_min_cart_in_order_count", 
                     "max":"user_max_cart_in_order_count", 
                     "median":"user_median_cart_in_order_count"})
    
def make(df, skeleton):
    main = skeleton.copy()
    
    main = pd.merge(main, user_orders_count(df), on="user_id", how="left")
    main.fillna(0, inplace=True)
    main["orders_count"] = main["orders_count"].astype(int)
    
    main = pd.merge(main, user_items_count(df), on="user_id", how="left")
    main.fillna(0, inplace=True)
    main["items_count"] = main["items_count"].astype(int)
    
    main = pd.merge(main, user_carts_count(df), on="user_id", how="left")
    main.fillna(0, inplace=True)
    main["carts_count"] = main["carts_count"].astype(int)
    
    main = pd.merge(main, cart_orders_count(df), on="cart", how="left")
    main.fillna(0, inplace=True)
    main["cart_orders_count"] = main["cart_orders_count"].astype(int)
    
    main = pd.merge(main, cart_unique_users_count(df), on="cart", how="left")
    main.fillna(0, inplace=True)
    main["cart_unique_users_count"] = main["cart_unique_users_count"].astype(int)
    
    main = pd.merge(main, user_mean_cart_in_order_count(df), on="user_id", how="left")
    main.fillna(0, inplace=True)
    
    main = pd.merge(main, cart_mean_count(df), on="cart", how="left")
    main.fillna(0, inplace=True)
    
    top_k = 5
    main = pd.merge(main, user_top_k_carts(df, top_k), on="user_id", how="left")
    main.fillna(-1, inplace=True)
    for i in range(top_k):
        main["user_top_"+str(i+1)+"_cart"] = main["user_top_"+str(i+1)+"_cart"].astype(int)
    

    return main

In [4]:
df = duplicates_to_count(df)

In [5]:
train_1, targets_1 = make_train_targets(df, level=1)

main_1 = make(train_1, targets_1)

main_1.head()

100%|████████████████████████████████████████████████████████████████████| 1120884/1120884 [00:02<00:00, 521374.44it/s]
100%|████████████████████████████████████████████████████████████████████████████████████| 5/5 [00:00<00:00, 69.44it/s]


Unnamed: 0,user_id,cart,target,orders_count,items_count,carts_count,cart_orders_count,cart_unique_users_count,user_mean_cart_in_order_count,user_min_cart_in_order_count,...,user_median_cart_in_order_count,cart_mean_count,cart_min_count,cart_max_count,cart_median_count,user_top_1_cart,user_top_2_cart,user_top_3_cart,user_top_4_cart,user_top_5_cart
0,0,14,0,2,33,27,85084,15268,16.5,8.0,...,16.5,1.002616,1.0,2.0,1.0,14,57,82,379,405
1,0,20,0,2,33,27,13654,5922,16.5,8.0,...,16.5,1.000366,1.0,2.0,1.0,14,57,82,379,405
2,0,57,1,2,33,27,98697,16325,16.5,8.0,...,16.5,1.00311,1.0,3.0,1.0,14,57,82,379,405
3,0,82,0,2,33,27,24952,7522,16.5,8.0,...,16.5,1.000441,1.0,2.0,1.0,14,57,82,379,405
4,0,379,0,2,33,27,19729,8462,16.5,8.0,...,16.5,1.000406,1.0,2.0,1.0,14,57,82,379,405


In [6]:
recs_1 = pd.read_csv("data/train_lvl_1_recs.csv")

In [7]:
recs_1.head()

Unnamed: 0,user_id,items,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14
0,0,"[57, 14, 82, 84, 430, 22, 409, 61, 379, 441, 3...",57,14,82,84,430,22,409,61,379,441,382,41,5,383,398
1,1,"[55, 798, 812, 169, 14, 88, 170, 171, 198, 23,...",55,798,812,169,14,88,170,171,198,23,404,19,406,57,61
2,2,"[57, 61, 23, 14, 409, 84, 82, 425, 398, 22, 43...",57,61,23,14,409,84,82,425,398,22,430,403,384,382,16
3,3,"[61, 57, 84, 398, 16, 430, 14, 22, 383, 382, 3...",61,57,84,398,16,430,14,22,383,382,399,41,43,23,402
4,4,"[57, 61, 398, 84, 54, 22, 712, 17, 100, 388, 4...",57,61,398,84,54,22,712,17,100,388,420,383,425,16,14


In [8]:
main_1 = pd.merge(main_1, recs_1, on="user_id", how="left")

In [9]:
main_array = main_1.to_numpy()
arr_has = []
arr_pos = []
index_of_items = main_1.columns.tolist().index("items")
for row in tqdm(main_array):
    v = row[index_of_items].replace(']','').replace('[','')
    v = v.split(", ")
    for i in range(len(v)):
        v[i] = int(v[i])
    if row[1] in v:
        arr_has.append(1)
        arr_pos.append(v.index(row[1]))
    else:
        arr_has.append(0)
        arr_pos.append(-1)
        
main_1["has_cart_in_recs"] = arr_has
main_1["pos_cart_in_recs"] = arr_pos

100%|█████████████████████████████████████████████████████████████████████| 1120884/1120884 [00:15<00:00, 71799.38it/s]


In [10]:
main_1.drop(columns=["items"], inplace=True)

Далее можно расширить обучающую выборку, добавив к ней датасет в таком же формате, но собраный при level=2, например

In [11]:
main_final = main_1 # or pd.concat([main_1, main_2])

In [12]:
main_final.shape

(1120884, 38)

In [13]:
top_k = 5
cat_features = ["has_cart_in_recs"] + ["user_top_"+str(i+1)+"_cart" for i in range(top_k)] + [str(i) for i in range(15)]
cat_features

['has_cart_in_recs',
 'user_top_1_cart',
 'user_top_2_cart',
 'user_top_3_cart',
 'user_top_4_cart',
 'user_top_5_cart',
 '0',
 '1',
 '2',
 '3',
 '4',
 '5',
 '6',
 '7',
 '8',
 '9',
 '10',
 '11',
 '12',
 '13',
 '14']

In [14]:
x_train, x_validation, y_train, y_validation = train_test_split(main_final.drop(columns=["user_id", "target"]), 
                                                                main_final["target"], 
                                                                stratify=main_final["target"],
                                                                test_size=0.33, 
#                                                                 random_state=42
                                                               )

In [15]:
model = CatBoostClassifier(iterations=300,
#                             depth = 6,
                            learning_rate = 0.35,
#                             l2_leaf_reg = 4,
                            eval_metric="F1",
                            loss_function = "Logloss",
                            task_type="GPU",
                            # fold_permutation_block = 2,
                            # fold_len_multiplier = 1.5,
                            # leaf_estimation_iterations = 10,
                            # max_ctr_complexity = 1,
                            random_seed= 127,
                            cat_features = cat_features
                           )

In [16]:
model.fit(x_train, 
          y_train, 
          eval_set=(x_validation, y_validation), 
          use_best_model=True, 
          early_stopping_rounds=50,  
#           plot=True, 
          verbose=10
          )

0:	learn: 0.1496443	test: 0.1768834	best: 0.1768834 (0)	total: 1.18s	remaining: 5m 51s
10:	learn: 0.2764308	test: 0.3070420	best: 0.3070420 (10)	total: 12.9s	remaining: 5m 37s
20:	learn: 0.3514155	test: 0.3665220	best: 0.3665220 (20)	total: 27.4s	remaining: 6m 3s
30:	learn: 0.3598710	test: 0.3732818	best: 0.3751456 (29)	total: 41.4s	remaining: 5m 59s
40:	learn: 0.3700871	test: 0.3805678	best: 0.3815778 (39)	total: 54.9s	remaining: 5m 46s
50:	learn: 0.3753164	test: 0.3834059	best: 0.3834059 (50)	total: 1m 7s	remaining: 5m 29s
60:	learn: 0.3783242	test: 0.3859450	best: 0.3868223 (56)	total: 1m 20s	remaining: 5m 17s
70:	learn: 0.3801099	test: 0.3867905	best: 0.3874881 (65)	total: 1m 33s	remaining: 5m 2s
80:	learn: 0.3855442	test: 0.3917870	best: 0.3917870 (80)	total: 1m 46s	remaining: 4m 48s
90:	learn: 0.3855387	test: 0.3909888	best: 0.3917870 (80)	total: 1m 58s	remaining: 4m 32s
100:	learn: 0.3867936	test: 0.3921875	best: 0.3921875 (100)	total: 2m 11s	remaining: 4m 19s
110:	learn: 0.3889

<catboost.core.CatBoostClassifier at 0x2710440a250>

In [17]:
# Score: 0.39940 (level=1), 0.48547 (level=2), 0.59612 (level=3) - mean=0.49366
# 0.44531 (level=1;2)

In [18]:
model.get_feature_importance(prettified=True)

Unnamed: 0,Feature Id,Importances
0,pos_cart_in_recs,14.380365
1,cart_orders_count,14.316317
2,user_top_5_cart,9.501397
3,13,8.654127
4,14,7.315899
5,carts_count,6.359754
6,orders_count,5.91888
7,user_top_4_cart,4.829065
8,has_cart_in_recs,3.870371
9,cart_unique_users_count,3.793756


In [None]:
useless_features = model.get_feature_importance(prettified=True).query("Importances==0")["Feature Id"].tolist()