## Typical baselines
Проверим три основных идеи:
* Порекомендуем пользователю его посленюю покупку
    - Local score: 0.40049
    - Public score: 0.39830
* Для каждого пользователя найдем самые часто покупаемые товары и порекомендуем их
    - Local score: 0.41843
    - Public score: 0.42430
* Каждому пользователю порекомендуем просто топ популярных товаров
    - Local score: 0.39849
    - Public score: 0.38834

In [1]:
%%time
%pylab inline
import pandas as pd
import numpy as np
from tqdm import tqdm
from sklearn.metrics import f1_score
import pickle
import gc
tqdm.pandas()

Populating the interactive namespace from numpy and matplotlib
Wall time: 1.6 s


In [2]:
df = pd.read_csv("train.csv")
ss = pd.read_csv("sample_submission.csv")

In [3]:
df.rename(columns={"order_completed_at":"time"}, inplace=True) # rename "order_completed_at" column to "time"
df["time"] = pd.to_datetime(df["time"], format="%Y-%m-%d %H:%M:%S") # "time" column to datetime type

In [4]:
# Split "id" column to "user_id" and "cart" columns
ss["user_id"] = ss.id.progress_apply(lambda x: x.split(";")[0]).astype(int)
ss["cart"] = ss.id.progress_apply(lambda x: x.split(";")[1]).astype(int)
ss = ss[["user_id", "cart", "target"]] 

100%|█████████████████████████████████████████████████████████████████████| 790449/790449 [00:00<00:00, 1086835.28it/s]
100%|██████████████████████████████████████████████████████████████████████| 790449/790449 [00:01<00:00, 694126.19it/s]


In [5]:
def duplicates_to_count(t): #Заменяет повторяющиеся товары в корзине на одну запись с указанием количества
    map_table = t.groupby("time")["user_id"].first().to_frame().reset_index()
    count_table = t.groupby("time")["cart"].value_counts().to_frame().rename(columns={"cart":"count"}).reset_index()
    return pd.merge(count_table, map_table, on="time", how="left").reset_index()[["user_id", "time", "cart", "count"]]

def count_to_duplicates(t): #Обратная операция
    g = t.copy()
    g["to_explode"] = g["count"].apply(lambda x: [i for i in range(x)])
    g.explode("to_explode").reset_index().drop(columns=["count", "to_explode", "index"], inplace=True)
    return g

def to_required_df(df): # [user_id, cart, target] -> [id, target]
    df["id"] = df["user_id"].astype(str) + ";" + df["cart"].astype(str)
    df.drop(columns=["user_id", "cart"], inplace=True)
    df.rename(columns={"predict":"target"}, inplace=True)
    return df[["id", "target"]]

def make_train_targets(t, level=1):# Создает train датасет и ответы для него, lvevel=1 - последняя покупка, level=2 - предпоследняя... 
    user_last_time = t.groupby(["user_id"])["time"].last().to_frame().reset_index()
    user_last_time["last_buy"] = 1
    
    train = pd.merge(t, user_last_time, on=["time", "user_id"], how="left")
    train = train[train["last_buy"] != 1]
    train.drop(columns=["last_buy"], inplace=True)
    
    user_last_time.drop(columns=["last_buy"], inplace=True)
    
    user_last_carts = pd.merge(user_last_time, df.drop(columns=["user_id"]), on="time", how="left")
    
    skeleton = make_skeleton(train)
    targets = pd.merge(skeleton, user_last_carts.drop(columns=["time"]), on=["user_id","cart"], how="left")
    targets.fillna(0, inplace=True)
    targets["count"] = targets["count"].progress_apply(lambda x: x if x <= 1 else 1).astype(int)
    targets.rename(columns={"count":"target"}, inplace=True)
    return train, targets

def make_skeleton(t): # Создает все комбинации user_id и cart, которые пользователь когда-либо покупал
    return t.groupby("user_id")["cart"].unique().to_frame().reset_index().explode("cart")
    

In [6]:
df = duplicates_to_count(df)

In [7]:
df.head()

Unnamed: 0,user_id,time,cart,count
0,2,2015-03-22 09:25:46,14,1
1,2,2015-03-22 09:25:46,16,1
2,2,2015-03-22 09:25:46,23,1
3,2,2015-03-22 09:25:46,57,1
4,2,2015-03-22 09:25:46,82,1


### User's last cart

In [8]:
def user_last_cart_baseline(t, skeleton):
    user_last_time = t.groupby(["user_id"])["time"].last().to_frame().reset_index()
    user_last_carts = pd.merge(user_last_time, df.drop(columns=["user_id"]), on="time", how="left") 
    res = pd.merge(skeleton, user_last_carts.drop(columns=["time"]), on=["user_id","cart"], how="left")\
        .fillna(0).rename(columns={"count":"predict"})
    res["predict"] = res["predict"].progress_apply(lambda x: x if x <= 1 else 1).astype(int)
    return res

In [9]:
train, targets = make_train_targets(df)

100%|███████████████████████████████████████████████████████████████████| 1032983/1032983 [00:00<00:00, 1194334.30it/s]


In [10]:
train.head()

Unnamed: 0,user_id,time,cart,count
0,2,2015-03-22 09:25:46,14,1
1,2,2015-03-22 09:25:46,16,1
2,2,2015-03-22 09:25:46,23,1
3,2,2015-03-22 09:25:46,57,1
4,2,2015-03-22 09:25:46,82,1


In [11]:
targets.head()

Unnamed: 0,user_id,cart,target
0,0,14,0
1,0,20,0
2,0,57,1
3,0,82,0
4,0,379,0


In [12]:
train_res = user_last_cart_baseline(train, make_skeleton(train))

100%|███████████████████████████████████████████████████████████████████| 1032983/1032983 [00:00<00:00, 1215581.46it/s]


In [13]:
train_res.head()

Unnamed: 0,user_id,cart,predict
0,0,14,1
1,0,20,0
2,0,57,1
3,0,82,1
4,0,379,1


In [14]:
train_res = pd.merge(train_res, targets, on=["user_id", "cart"], how="left")

In [15]:
f1_score(train_res["target"], train_res["predict"])

0.40049281930584185

In [16]:
res = user_last_cart_baseline(df, ss[["user_id","cart"]])

100%|█████████████████████████████████████████████████████████████████████| 790449/790449 [00:00<00:00, 1155719.82it/s]


In [17]:
res.head()

Unnamed: 0,user_id,cart,predict
0,0,133,0
1,0,5,0
2,0,10,0
3,0,396,0
4,0,14,0


In [18]:
res = to_required_df(res)

In [19]:
res.head()

Unnamed: 0,id,target
0,0;133,0
1,0;5,0
2,0;10,0
3,0;396,0
4,0;14,0


In [20]:
res.to_csv("last_order_baseline.csv", index=False)

In [21]:
# Local score: 0.40049
# Public score: 0.39830

### User's top personal recommendations

In [22]:
def user_top_k_personal_rec_baseline(t, skeleton, k=10):
    g = count_to_duplicates(t).groupby("user_id")["cart"].value_counts().to_frame().rename(columns={"cart":"count"})\
        .groupby("user_id")["count"].head(k).to_frame().reset_index().drop(columns=["count"])\
        .groupby("user_id")["cart"].agg([lambda x: x.tolist()]).rename(columns={"<lambda>":"user_top_"+str(k)+"_carts"}).reset_index()
    res = pd.merge(skeleton, g, on=["user_id"], how="left")
    arr = []
    for index, row in tqdm(res.iterrows(), total=res.shape[0]):
        if row["cart"] in row["user_top_"+str(k)+"_carts"]:
            arr.append(1)
        else:
            arr.append(0)
    res["predict"] = arr
    res.drop(columns=["user_top_10_carts"], inplace=True)
    return res

In [23]:
train, targets = make_train_targets(df)

100%|████████████████████████████████████████████████████████████████████| 1032983/1032983 [00:01<00:00, 869690.44it/s]


In [24]:
train_res = user_top_k_personal_rec_baseline(train, make_skeleton(train))

100%|█████████████████████████████████████████████████████████████████████| 1032983/1032983 [00:41<00:00, 24674.92it/s]


In [25]:
train_res = pd.merge(train_res, targets, on=["user_id", "cart"], how="left")

In [26]:
f1_score(train_res["target"], train_res["predict"])

0.4184343601859459

In [27]:
res = user_top_k_personal_rec_baseline(df, ss[["user_id","cart"]])

100%|███████████████████████████████████████████████████████████████████████| 790449/790449 [00:33<00:00, 23784.56it/s]


In [28]:
res.head()

Unnamed: 0,user_id,cart,predict
0,0,133,0
1,0,5,1
2,0,10,1
3,0,396,0
4,0,14,1


In [29]:
res = to_required_df(res)

In [30]:
res.to_csv("user_top_10_rec_baseline.csv", index=False)

In [31]:
# Local score: 0.41843
# Public score: 0.42430

### Top popular items

In [32]:
def top_k_popular_baseline(t, skeleton, k=15):
    top_k_items = t.groupby("cart")["count"].sum().to_frame().reset_index().sort_values("count", ascending=False).head(k)["cart"].tolist()
    g = pd.DataFrame()
    g["user_id"] = [i for i in range(20000)]
    g["top_"+str(k)+"_items"] = [top_k_items for i in range(20000)]
    
    res = pd.merge(skeleton, g, on=["user_id"], how="left")
    
    arr = []
    for index, row in tqdm(res.iterrows(), total=res.shape[0]):
        if row["cart"] in row["top_"+str(k)+"_items"]:
            arr.append(1)
        else:
            arr.append(0)
    res["predict"] = arr
#     res.drop(columns=["top_15_items"], inplace=True)
    return res

In [33]:
train, targets = make_train_targets(df)

100%|████████████████████████████████████████████████████████████████████| 1032983/1032983 [00:01<00:00, 972454.75it/s]


In [34]:
train_res = top_k_popular_baseline(train, make_skeleton(train))

100%|█████████████████████████████████████████████████████████████████████| 1032983/1032983 [00:46<00:00, 22091.11it/s]


In [35]:
train_res = pd.merge(train_res, targets, on=["user_id", "cart"], how="left")

In [36]:
f1_score(train_res["target"], train_res["predict"])

0.39849435127911614

In [37]:
res = user_top_k_personal_rec_baseline(df, ss[["user_id","cart"]])

100%|███████████████████████████████████████████████████████████████████████| 790449/790449 [00:33<00:00, 23299.45it/s]


In [38]:
res = to_required_df(res)

In [39]:
res.to_csv("user_top_10_rec_baseline.csv", index=False)

In [40]:
# Local score: 0.39849
# Public score: 0.38834