### ML
Суть решения в том, чтобы собрать статистики по пользователю и по предмету и к каждой комбинации этих статистик предсказывать 0 или 1  
Использовался CatBoost - градиентный бустинг над решающими деревьями  
+Также добавлялись сюда рекомендации от TIFU KNN, это дало хороший прирост +0.06 к метрике, но в итоге максимум 0.36  
Основное обучение происходило на Колабе (colab.research.google.com)

In [None]:
!pip install catboost==0.25.1 # В 0.26 обучение на GPU умирает

In [1]:
%%time
%pylab inline
import pandas as pd
import numpy as np
from catboost import CatBoostClassifier
from sklearn.model_selection import train_test_split
from tqdm import tqdm
import pickle
import gc
tqdm.pandas()

Populating the interactive namespace from numpy and matplotlib
Wall time: 2.8 s


### Unpack the data

In [2]:
df = pd.read_csv("train.csv")
ss = pd.read_csv("sample_submission.csv")

df.rename(columns={"order_completed_at":"time"}, inplace=True) # rename "order_completed_at" column to "time"
df["time"] = pd.to_datetime(df["time"], format="%Y-%m-%d %H:%M:%S") # "time" column to datetime type

# Split "id" column to "user_id" and "cart" columns
ss["user_id"] = ss.id.progress_apply(lambda x: x.split(";")[0]).astype(int)
ss["cart"] = ss.id.progress_apply(lambda x: x.split(";")[1]).astype(int)
ss = ss[["user_id", "cart", "target"]] 

100%|██████████████████████████████████████████████████████████████████████| 790449/790449 [00:01<00:00, 613181.38it/s]
100%|██████████████████████████████████████████████████████████████████████| 790449/790449 [00:01<00:00, 564831.28it/s]


In [3]:
def duplicates_to_count(t):
    map_table = t.groupby("time")["user_id"].first().to_frame().reset_index()
    count_table = t.groupby("time")["cart"].value_counts().to_frame().rename(columns={"cart":"count"}).reset_index()
    return pd.merge(count_table, map_table, on="time", how="left").reset_index()[["user_id", "time", "cart", "count"]]

def count_to_duplicates(t):
    g = t.copy()
    g["to_explode"] = g["count"].apply(lambda x: [i for i in range(x)])
    g.explode("to_explode").reset_index().drop(columns=["count", "to_explode", "index"])
    return g

def to_required_df(df): # [user_id, cart, target] -> [id, target]
    df["id"] = df["user_id"].astype(str) + ";" + df["cart"].astype(str)
    df.drop(columns=["user_id", "cart"], inplace=True)
    df.rename(columns={"predict":"target"}, inplace=True)
    return df[["id", "target"]]

def make_train_targets(t, level=1):
    user_last_time = t.groupby(["user_id"])["time"].last().to_frame().reset_index()
    user_last_time["last_buy"] = 1
    
    train = pd.merge(t, user_last_time, on=["time", "user_id"], how="left")
    train = train[train["last_buy"] != 1]
    train.drop(columns=["last_buy"], inplace=True)
    
    if level >= 2:
        return make_train_targets(train, level-1)
    
    user_last_time.drop(columns=["last_buy"], inplace=True)
    
    user_last_carts = pd.merge(user_last_time, df.drop(columns=["user_id"]), on="time", how="left")
    
    skeleton = make_skeleton(train)
    targets = pd.merge(skeleton, user_last_carts.drop(columns=["time"]), on=["user_id","cart"], how="left")
    targets.fillna(0, inplace=True)
    targets["count"] = targets["count"].progress_apply(lambda x: x if x <= 1 else 1).astype(int)
    targets.rename(columns={"count":"target"}, inplace=True)
    return train, targets

def make_skeleton(t):
    return t.groupby("user_id")["cart"].unique().to_frame().reset_index().explode("cart")

# Топ категорий пользователя
def user_top_k_carts(t, k):
    g = count_to_duplicates(t).groupby("user_id")["cart"].value_counts().to_frame().rename(columns={"cart":"count"})\
        .groupby("user_id")["count"].head(k).to_frame().reset_index().drop(columns=["count"])\
        .groupby("user_id")["cart"].agg([lambda x: x.tolist()]).rename(columns={"<lambda>":"user_top_"+str(k)+"_carts"}).reset_index()
    for i in tqdm(range(k)):
        g["user_top_"+str(i+1)+"_cart"] = g["user_top_"+str(k)+"_carts"].apply(lambda x: x[i] if len(x) > i else -1)
    return g.drop(columns=["user_top_"+str(k)+"_carts"]) 

# Кол-во заказов пользователя
def user_orders_count(t):
    return t.groupby(["user_id"])["time"].nunique().to_frame().reset_index().rename(columns={"time":"orders_count"})

# Кол-во вещей пользователя по всем заказам
def user_items_count(t):
    return t.groupby("user_id")["count"].sum().to_frame().reset_index().rename(columns={"count":"items_count"})

# Таблица как в TIFU KNN, сильно грузит память (около 30GB RAM), но прироста большого не дает
def user_pivot_table(t):
    g = pd.pivot_table(t, columns="cart", index="user_id", values="count", aggfunc=np.sum, fill_value=0)
    for cart in list(set(df["cart"].unique()).difference(set(t["cart"].unique()))):
        g[cart] = 0
    return g

# Количество уникальных категорий, в которых покупал пользователь
def user_carts_count(t):
    return t.groupby("user_id")["cart"].nunique().to_frame().reset_index().rename(columns={"cart":"carts_count"})

# Кол-во заказов каждой категории
def cart_orders_count(t):
    return t.groupby("cart")["count"].agg(["sum"]).reset_index().rename(columns={"sum":"cart_orders_count"})

# Кол-во пользователей, которые заказали товар из категории
def cart_unique_users_count(t):
    return t.groupby("cart")["user_id"].nunique().to_frame().reset_index().rename(columns={"user_id":"cart_unique_users_count"})

# Среднее, мин., макс. и медиана количества товаров каждой категории в корзинеы
def cart_mean_count(t):
    return t.groupby("cart")["count"].agg([("cart_mean_count","mean"), ("cart_min_count", "min"), ("cart_max_count", "max"), ("cart_median_count", "median")]).reset_index()

# Среднее, мин., макс. и медиана размера корзины пользователя за все заказы
def user_mean_cart_in_order_count(t):
    return t.groupby(["user_id","time"])["count"].sum().to_frame().groupby(["user_id"])["count"].agg(["mean", "min", "max", "median"]).reset_index()\
            .rename(columns={"mean":"user_mean_cart_in_order_count", 
                     "min":"user_min_cart_in_order_count", 
                     "max":"user_max_cart_in_order_count", 
                     "median":"user_median_cart_in_order_count"})

def to_required_df(df): # [user_id, cart, target] -> [id, target]
    df["id"] = df["user_id"].astype(str) + ";" + df["cart"].astype(str)
    df.drop(columns=["user_id", "cart"], inplace=True)
    return df
    
def make(df, skeleton):
    main = skeleton.copy()
    
    main = pd.merge(main, user_orders_count(df), on="user_id", how="left")
    main.fillna(0, inplace=True)
    main["orders_count"] = main["orders_count"].astype(int)
    
    main = pd.merge(main, user_items_count(df), on="user_id", how="left")
    main.fillna(0, inplace=True)
    main["items_count"] = main["items_count"].astype(int)
    
    main = pd.merge(main, user_carts_count(df), on="user_id", how="left")
    main.fillna(0, inplace=True)
    main["carts_count"] = main["carts_count"].astype(int)
    
    main = pd.merge(main, cart_orders_count(df), on="cart", how="left")
    main.fillna(0, inplace=True)
    main["cart_orders_count"] = main["cart_orders_count"].astype(int)
    
    main = pd.merge(main, cart_unique_users_count(df), on="cart", how="left")
    main.fillna(0, inplace=True)
    main["cart_unique_users_count"] = main["cart_unique_users_count"].astype(int)
    
    main = pd.merge(main, user_mean_cart_in_order_count(df), on="user_id", how="left")
    main.fillna(0, inplace=True)
    
    main = pd.merge(main, cart_mean_count(df), on="cart", how="left")
    main.fillna(0, inplace=True)
    
    top_k = 5
    main = pd.merge(main, user_top_k_carts(df, top_k), on="user_id", how="left")
    main.fillna(-1, inplace=True)

    return main
    

In [4]:
df = duplicates_to_count(df)

In [5]:
train, targets = make_train_targets(df, level=1)

100%|████████████████████████████████████████████████████████████████████| 1032983/1032983 [00:01<00:00, 736266.35it/s]


In [6]:
main = make(train, make_skeleton(train)) # of ss[["user_id", "cart"]] instead of make_skeleton(train) in test case

100%|████████████████████████████████████████████████████████████████████████████████████| 5/5 [00:00<00:00, 86.36it/s]


In [7]:
main = pd.merge(main, targets, on=["user_id", "cart"], how="left")

In [8]:
recs = pd.read_csv("train_recs.csv")

In [9]:
recs.head()

Unnamed: 0,user_id,items,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14
0,0,"[57, 14, 82, 84, 430, 22, 409, 61, 379, 441, 3...",57,14,82,84,430,22,409,61,379,441,382,41,5,383,398
1,1,"[55, 798, 812, 169, 14, 88, 170, 171, 198, 23,...",55,798,812,169,14,88,170,171,198,23,404,19,406,57,61
2,2,"[57, 61, 23, 14, 409, 84, 82, 425, 398, 22, 43...",57,61,23,14,409,84,82,425,398,22,430,403,384,382,16
3,3,"[61, 57, 84, 398, 16, 430, 14, 22, 383, 382, 3...",61,57,84,398,16,430,14,22,383,382,399,41,43,23,402
4,4,"[57, 61, 398, 84, 54, 22, 712, 17, 100, 388, 4...",57,61,398,84,54,22,712,17,100,388,420,383,425,16,14


In [10]:
main = pd.merge(main, recs, on="user_id", how="left")

In [11]:
main_array = main.to_numpy()
arr_has = []
arr_pos = []
for row in tqdm(main_array):
    v = row[21].replace(']','').replace('[','')
    v = v.split(", ")
    for i in range(len(v)):
        v[i] = int(v[i])
    if row[1] in v:
        arr_has.append(1)
        arr_pos.append(v.index(row[1]))
    else:
        arr_has.append(0)
        arr_pos.append(-1)

100%|████████████████████████████████████████████████████████████████████| 1032983/1032983 [00:09<00:00, 107952.25it/s]


In [12]:
main["has_cart_in_recs"] = arr_has
main["pos_cart_in_recs"] = arr_pos

In [13]:
main.head()

Unnamed: 0,user_id,cart,orders_count,items_count,carts_count,cart_orders_count,cart_unique_users_count,user_mean_cart_in_order_count,user_min_cart_in_order_count,user_max_cart_in_order_count,...,7,8,9,10,11,12,13,14,has_cart_in_recs,pos_cart_in_recs
0,0,14,2,33,27,85084,15268,16.5,8,25,...,61,379,441,382,41,5,383,398,1,1
1,0,20,2,33,27,13654,5922,16.5,8,25,...,61,379,441,382,41,5,383,398,0,-1
2,0,57,2,33,27,98697,16325,16.5,8,25,...,61,379,441,382,41,5,383,398,1,0
3,0,82,2,33,27,24952,7522,16.5,8,25,...,61,379,441,382,41,5,383,398,1,2
4,0,379,2,33,27,19729,8462,16.5,8,25,...,61,379,441,382,41,5,383,398,1,8


In [14]:
main.drop(columns=["items"], inplace=True)

In [15]:
main_final = main # pd.concat([main, main2])

In [16]:
main_final.shape

(1032983, 38)

In [17]:
top_k = 5
cat_features = ["has_cart_in_recs"] + ["user_top_"+str(i+1)+"_cart" for i in range(top_k)] + [str(i) for i in range(15)]
cat_features

['has_cart_in_recs',
 'user_top_1_cart',
 'user_top_2_cart',
 'user_top_3_cart',
 'user_top_4_cart',
 'user_top_5_cart',
 '0',
 '1',
 '2',
 '3',
 '4',
 '5',
 '6',
 '7',
 '8',
 '9',
 '10',
 '11',
 '12',
 '13',
 '14']

In [18]:
x_train, x_validation, y_train, y_validation = train_test_split(main_final.drop(columns=["user_id", "target"]), 
                                                                main_final["target"], 
                                                                stratify=main_final["target"],
                                                                test_size=0.33, 
#                                                                 random_state=42
                                                               )

In [19]:
model = CatBoostClassifier(iterations=200,
#                             depth = 6,
                            learning_rate = 0.35,
#                             l2_leaf_reg = 4,
                            eval_metric="F1",
                            loss_function = "Logloss",
                            task_type="GPU",
                            # fold_permutation_block = 2,
                            # fold_len_multiplier = 1.5,
                            # leaf_estimation_iterations = 10,
                            # max_ctr_complexity = 1,
                            random_seed= 127,
                            cat_features = cat_features
                           )

In [20]:
model.fit(x_train, 
          y_train, 
          eval_set=(x_validation, y_validation), 
          use_best_model=True, 
          early_stopping_rounds=50,  
#           plot=True, 
          verbose=10
          )

0:	learn: 0.0000000	test: 0.0000000	best: 0.0000000 (0)	total: 1.1s	remaining: 3m 38s
10:	learn: 0.2404335	test: 0.2664657	best: 0.2664657 (10)	total: 12.4s	remaining: 3m 33s
20:	learn: 0.3092925	test: 0.3268165	best: 0.3268165 (20)	total: 24.7s	remaining: 3m 30s
30:	learn: 0.3198516	test: 0.3365685	best: 0.3365685 (30)	total: 37.7s	remaining: 3m 25s
40:	learn: 0.3279398	test: 0.3437308	best: 0.3439760 (36)	total: 49.9s	remaining: 3m 13s
50:	learn: 0.3308267	test: 0.3463728	best: 0.3474751 (48)	total: 1m 2s	remaining: 3m 2s
60:	learn: 0.3342671	test: 0.3482946	best: 0.3484404 (58)	total: 1m 14s	remaining: 2m 48s
70:	learn: 0.3361771	test: 0.3506968	best: 0.3508443 (66)	total: 1m 25s	remaining: 2m 35s
80:	learn: 0.3382296	test: 0.3530393	best: 0.3530393 (80)	total: 1m 37s	remaining: 2m 23s
90:	learn: 0.3394331	test: 0.3551270	best: 0.3561118 (85)	total: 1m 49s	remaining: 2m 10s
100:	learn: 0.3411626	test: 0.3567872	best: 0.3569659 (97)	total: 2m 1s	remaining: 1m 59s
110:	learn: 0.341740

<catboost.core.CatBoostClassifier at 0x2316794d760>

In [21]:
model.get_feature_importance(prettified=True)

Unnamed: 0,Feature Id,Importances
0,cart_orders_count,22.41628
1,pos_cart_in_recs,11.012335
2,user_top_5_cart,7.61843
3,13,7.201988
4,14,5.979828
5,user_median_cart_in_order_count,5.613128
6,has_cart_in_recs,5.547244
7,cart_unique_users_count,5.310661
8,cart_mean_count,4.628495
9,cart,4.394014


In [None]:
useless_features = model.get_feature_importance(prettified=True).query("Importances==0")["Feature Id"].tolist()

In [None]:
useless_features