## Typical baselines
Проверим три основных идеи:
* Порекомендуем пользователю его посленюю покупку
    - Score: 0.32772 (mean) - 0.33847 (level=1), 0.33623 (level=2), 0.30845 (level=3), 
* Для каждого пользователя найдем самые часто покупаемые товары и порекомендуем их
    - Score: 0.32311 (mean) - 0.34178 (level=1), 0.33146 (level=2), 0.29609 (level=3)
* Каждому пользователю порекомендуем просто топ популярных товаров
    - Score: 0.37599 (mean) - 0.36759 (level=1), 0.37323 (level=2), 0.38714 (level=3)
    
! Можно использовать режим "Run all"

In [1]:
%%time
%pylab inline
import pandas as pd
import numpy as np
from tqdm import tqdm
from sklearn.metrics import f1_score
tqdm.pandas()

Populating the interactive namespace from numpy and matplotlib
Wall time: 1.79 s


In [2]:
df = pd.read_csv("data/main.csv")

In [3]:
df.rename(columns={"order_completed_at":"time"}, inplace=True) # rename "order_completed_at" column to "time"
df["time"] = pd.to_datetime(df["time"], format="%Y-%m-%d %H:%M:%S") # "time" column to datetime type

In [4]:
def duplicates_to_count(t): #Заменяет повторяющиеся товары в корзине на одну запись с указанием количества
    map_table = t.groupby("time")["user_id"].first().to_frame().reset_index()
    count_table = t.groupby("time")["cart"].value_counts().to_frame().rename(columns={"cart":"count"}).reset_index()
    return pd.merge(count_table, map_table, on="time", how="left").reset_index()[["user_id", "time", "cart", "count"]]

def count_to_duplicates(t): #Обратная операция
    g = t.copy()
    g["to_explode"] = g["count"].apply(lambda x: [i for i in range(x)])
    g.explode("to_explode").reset_index().drop(columns=["count", "to_explode", "index"], inplace=True)
    return g

# Создает train датасет и ответы для него
# lvevel=1 означает, что для формирования ответов берётся последняя покупка, level=2 - предпоследняя и так далее 
def make_train_targets(t, level=1): 
    user_last_time = t.groupby(["user_id"])["time"].last().to_frame().reset_index()
    user_last_time["last_buy"] = 1
    
    train = pd.merge(t, user_last_time, on=["time", "user_id"], how="left")
    train = train[train["last_buy"] != 1]
    train.drop(columns=["last_buy"], inplace=True)
    
    if level >= 2:
        return make_train_targets(train, level-1)
    
    user_last_time.drop(columns=["last_buy"], inplace=True)
    
    user_last_carts = pd.merge(user_last_time, df.drop(columns=["user_id"]), on="time", how="left")
    
    skeleton = make_skeleton(t)
    targets = pd.merge(skeleton, user_last_carts.drop(columns=["time"]), on=["user_id","cart"], how="left")
    targets.fillna(0, inplace=True)
    targets["count"] = targets["count"].apply(lambda x: x if x <= 1 else 1).astype(int)
    targets.rename(columns={"count":"target"}, inplace=True)
    return train, targets

def make_skeleton(t): # Создает все комбинации user_id и cart, которые пользователь когда-либо покупал
    return t.groupby("user_id")["cart"].unique().to_frame().reset_index().explode("cart")
    

In [5]:
df = duplicates_to_count(df)

In [6]:
df.head()

Unnamed: 0,user_id,time,cart,count
0,2,2015-03-22 09:25:46,14,1
1,2,2015-03-22 09:25:46,16,1
2,2,2015-03-22 09:25:46,23,1
3,2,2015-03-22 09:25:46,57,1
4,2,2015-03-22 09:25:46,82,1


### User's last cart

In [7]:
def user_last_cart_baseline(t, skeleton):
    user_last_time = t.groupby(["user_id"])["time"].last().to_frame().reset_index()
    user_last_carts = pd.merge(user_last_time, df.drop(columns=["user_id"]), on="time", how="left") 
    res = pd.merge(skeleton, user_last_carts.drop(columns=["time"]), on=["user_id","cart"], how="left")\
        .fillna(0).rename(columns={"count":"predict"})
    res["predict"] = res["predict"].progress_apply(lambda x: x if x <= 1 else 1).astype(int)
    return res

In [8]:
train, targets = make_train_targets(df, level=1)

In [9]:
train.head()

Unnamed: 0,user_id,time,cart,count
0,2,2015-03-22 09:25:46,14,1
1,2,2015-03-22 09:25:46,16,1
2,2,2015-03-22 09:25:46,23,1
3,2,2015-03-22 09:25:46,57,1
4,2,2015-03-22 09:25:46,82,1


In [10]:
targets.head()

Unnamed: 0,user_id,cart,target
0,0,14,0
1,0,20,0
2,0,57,1
3,0,82,0
4,0,379,0


In [11]:
train_res = user_last_cart_baseline(train, targets)

100%|████████████████████████████████████████████████████████████████████| 1120884/1120884 [00:01<00:00, 861338.61it/s]


In [12]:
train_res.head()

Unnamed: 0,user_id,cart,target,predict
0,0,14,0,1
1,0,20,0,0
2,0,57,1,1
3,0,82,0,1
4,0,379,0,1


In [13]:
f1_score(train_res["target"], train_res["predict"])

0.3384696700082806

### User's top personal recommendations

In [14]:
def user_top_k_personal_rec_baseline(t, skeleton, k=10):
    g = count_to_duplicates(t).groupby("user_id")["cart"].value_counts().to_frame().rename(columns={"cart":"count"})\
        .groupby("user_id")["count"].head(k).to_frame().reset_index().drop(columns=["count"])\
        .groupby("user_id")["cart"].agg([lambda x: x.tolist()]).rename(columns={"<lambda>":"user_top_"+str(k)+"_carts"}).reset_index()
    res = pd.merge(skeleton, g, on="user_id", how="left")
    arr = []
    index_of_cart = res.columns.tolist().index("cart")
    index_of_recs = res.columns.tolist().index("user_top_"+str(k)+"_carts")
    res_array = res.to_numpy()
    for row in tqdm(res_array):
        if type(row[index_of_recs]) == float: #if nan
            arr.append(0)
            continue
        if row[index_of_cart] in row[index_of_recs]:
            arr.append(1)
        else:
            arr.append(0)
    res["predict"] = arr
    res.drop(columns=["user_top_10_carts"], inplace=True)
    return res

In [15]:
train, targets = make_train_targets(df, level=1)

In [16]:
train_res = user_top_k_personal_rec_baseline(train, targets)

100%|████████████████████████████████████████████████████████████████████| 1120884/1120884 [00:01<00:00, 958706.37it/s]


In [17]:
train_res.head()

Unnamed: 0,user_id,cart,target,predict
0,0,14,0,1
1,0,20,0,1
2,0,57,1,1
3,0,82,0,1
4,0,379,0,1


In [18]:
f1_score(train_res["target"], train_res["predict"])

0.34178262473350657

### Top popular items

In [19]:
def top_k_popular_baseline(t, skeleton, k=15):
    top_k_items = t.groupby("cart")["count"].sum().to_frame().reset_index().sort_values("count", ascending=False).head(k)["cart"].tolist()
    g = pd.DataFrame()
    g["user_id"] = [i for i in range(20000)]
    g["top_"+str(k)+"_items"] = [top_k_items for i in range(20000)]
    
    res = pd.merge(skeleton, g, on=["user_id"], how="left")
    
    arr = []
    index_of_cart = res.columns.tolist().index("cart")
    index_of_recs = res.columns.tolist().index("top_"+str(k)+"_items")
    res_array = res.to_numpy()
    for row in tqdm(res_array):
        if type(row[index_of_recs]) == float: #if nan
            arr.append(0)
            continue
        if row[index_of_cart] in row[index_of_recs]:
            arr.append(1)
        else:
            arr.append(0)
    res["predict"] = arr
    res.drop(columns=["top_"+str(k)+"_items"], inplace=True)
    return res

In [20]:
train, targets = make_train_targets(df, level=1)

In [21]:
train_res = top_k_popular_baseline(train, targets)

100%|███████████████████████████████████████████████████████████████████| 1120884/1120884 [00:01<00:00, 1081173.97it/s]


In [22]:
f1_score(train_res["target"], train_res["predict"])

0.36759007646407765