## Данные и матрица интеракций

In [39]:
import pandas as pd
import numpy as np

data = pd.read_csv('recipes_normalized.csv')

data.head(10)

Unnamed: 0,url,name,ingredients,ingredients_normalized
0,https://www.povarenok.ru/recipes/show/164365/,Густой молочно-клубничный коктейль,"{'Молоко': '250 мл', 'Клубника': '200 г', 'Сах...","{'Молоко': '250 мл', 'Клубника': '200 г', 'Сах..."
1,https://www.povarenok.ru/recipes/show/1306/,Рулетики,"{'Сыр твердый': None, 'Чеснок': None, 'Яйцо ку...","{'Сыр твердый': None, 'Чеснок': None, 'Яйцо ку..."
2,https://www.povarenok.ru/recipes/show/10625/,"Салат ""Баклажанчик""","{'Баклажан': '3 шт', 'Лук репчатый': '2 шт', '...","{'Баклажан': '3 шт', 'Лук репчатый': '2 шт', '..."
3,https://www.povarenok.ru/recipes/show/167337/,Куриные котлеты с картофельным пюре в духовке,"{'Фарш куриный': '800 г', 'Пюре картофельное':...","{'Фарш куриный': '800 г', 'Пюре картофельное':..."
4,https://www.povarenok.ru/recipes/show/91919/,Рецепт вишневой наливки,"{'Вишня': '1 кг', 'Водка': '1 л', 'Сахар': '30...","{'Вишня': '1 кг', 'Водка': '1 л', 'Сахар-песок..."
5,https://www.povarenok.ru/recipes/show/167765/,Песочный пирог с тыквенным суфле,"{'Масло сливочное': '100 г', 'Сахар': '50 г', ...","{'Масло сливочное': '100 г', 'Сахар-песок': '5..."
6,https://www.povarenok.ru/recipes/show/100230/,Шоколадные конфеты ручной работы,"{'Какао-масло': '100 г', 'Какао тертое': '200 ...","{'Какао-масло': '100 г', 'Какао тертое': '200 ..."
7,https://www.povarenok.ru/recipes/show/96257/,Рыбно-тыквенный гратен,"{'Масло растительное': '3 ст. л.', 'Рыба': '40...","{'Масло растительное': '3 ст. л.', 'Рыба': '40..."
8,https://www.povarenok.ru/recipes/show/139360/,"Плов с креветками из риса ""Басмати""","{'Рис': '2 стак.', 'Вода': '4 стак.', 'Морковь...","{'Рис': '2 стак.', 'Вода': '4 стак.', 'Морковь..."
9,https://www.povarenok.ru/recipes/show/96774/,Молочное мороженое,"{'Молоко': '450 г', 'Желток яичный': '3 шт', '...","{'Молоко': '450 г', 'Желток яичный': '3 шт', '..."


### Почистим данные 

In [40]:
from scipy.sparse import csr_matrix
from sklearn.preprocessing import LabelEncoder
import ast

all_ingredients = set()
for ingredients_str in data['ingredients_normalized']:
    ingredients_parsed = ast.literal_eval(ingredients_str)
    all_ingredients.update(ingredients_parsed.keys())

all_ingredients = sorted(list(all_ingredients))
ingredient_to_idx = {ing: idx for idx, ing in enumerate(all_ingredients)}

print(f"Ингредиентов: {len(all_ingredients)}")
print(f"Рецептов: {len(data)}")

Ингредиентов: 1072
Рецептов: 146581


In [41]:
stop_words_drop = ['Соль', 'Сахар-песок', 'Перец черный молотый', 'Мука пшеничная',
'Сода', 'Сода гашеная уксусом']

In [42]:
from tqdm import tqdm

interactions = []
for idx, row in tqdm(data.iterrows(), total=len(data)):
    ingredients_parsed = ast.literal_eval(row['ingredients_normalized'])
    recipe_id = row.get('url', idx) 
        
    for ingredient in ingredients_parsed.keys():
        interactions.append((recipe_id, ingredient))

interactions_df = pd.DataFrame(interactions, columns=['recipe_id', 'ingredient_id'])
print(f"Interactions {len(interactions_df)}")

unique_recipes = interactions_df['recipe_id'].unique()
all_unique_ingredients = interactions_df['ingredient_id'].unique()
unique_ingredients = [
    ingredient for ingredient in all_unique_ingredients if ingredient not in stop_words_drop
]

recipe2id = {recipe: i for i, recipe in enumerate(unique_recipes)}
item2id = {ingredient: i for i, ingredient in enumerate(unique_ingredients)}

id2recipe = {i: recipe for recipe, i in recipe2id.items()}
id2item = {i: ingredient for ingredient, i in item2id.items()}

interactions_df['user_id'] = interactions_df['recipe_id'].map(recipe2id)
interactions_df['item_id'] = interactions_df['ingredient_id'].map(item2id)
interactions_df.dropna(subset=['item_id'], inplace=True)
interactions_df['item_id'] = interactions_df['item_id'].astype(int)


100%|██████████| 146581/146581 [00:15<00:00, 9643.47it/s] 


Interactions 1282208


In [43]:
print(len(unique_recipes), len(unique_ingredients))

146564 1066


In [44]:
interactions_df.head(7)

Unnamed: 0,recipe_id,ingredient_id,user_id,item_id
0,https://www.povarenok.ru/recipes/show/164365/,Молоко,0,0
1,https://www.povarenok.ru/recipes/show/164365/,Клубника,0,1
3,https://www.povarenok.ru/recipes/show/1306/,Сыр твердый,1,2
4,https://www.povarenok.ru/recipes/show/1306/,Чеснок,1,3
5,https://www.povarenok.ru/recipes/show/1306/,Яйцо куриное,1,4
6,https://www.povarenok.ru/recipes/show/1306/,Грейпфрут,1,5
7,https://www.povarenok.ru/recipes/show/1306/,Лук зеленый,1,6


In [45]:
rows = interactions_df['user_id'].values
cols = interactions_df['item_id'].values
values = np.ones(len(rows), dtype=np.int8)

num_users= len(unique_recipes)
num_items= len(unique_ingredients)

interactions_matrix = csr_matrix(
    (values, (rows, cols)), 
    shape=(num_users, num_items)
)

In [46]:
print(f"Sparsity:{(1 -interactions_matrix.nnz /(num_users * num_items)) * 100:.2f}%")

Sparsity:99.32%


In [47]:
'''
import scipy.sparse as sp
import pickle

sp.save_npz('recsys_interactions.npz', interactions_matrix)

artifacts = {
    'recipe_to_cat': recipe_to_cat,
    'ingredient_to_cat': ingredient_to_cat,
    'cat_to_recipe': cat_to_recipe,
    'cat_to_ingredient': cat_to_ingredient,
    'unique_recipes': unique_recipes,
    'unique_ingredients': unique_ingredients
}

with open('model_artifacts.pkl', 'wb') as f:
    pickle.dump(artifacts, f)
print("model_artifacts.pkl")
'''

'\nimport scipy.sparse as sp\nimport pickle\n\nsp.save_npz(\'recsys_interactions.npz\', interactions_matrix)\n\nartifacts = {\n    \'recipe_to_cat\': recipe_to_cat,\n    \'ingredient_to_cat\': ingredient_to_cat,\n    \'cat_to_recipe\': cat_to_recipe,\n    \'cat_to_ingredient\': cat_to_ingredient,\n    \'unique_recipes\': unique_recipes,\n    \'unique_ingredients\': unique_ingredients\n}\n\nwith open(\'model_artifacts.pkl\', \'wb\') as f:\n    pickle.dump(artifacts, f)\nprint("model_artifacts.pkl")\n'

In [48]:
interactions_matrix.toarray()

array([[1, 1, 0, ..., 0, 0, 0],
       [0, 0, 1, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], shape=(146564, 1066), dtype=int8)

### Train test split

In [49]:
def train_val_test_split(
    interactions_df: pd.DataFrame,
    user_col: str = 'user_id',
    item_col: str = 'item_id',
    k_core: int = 3,
    test_size: float = 0.2,
    val_size: float = 0.1,
    seed: int = 42
) -> tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]:

    while True:
        #взаимодействия для пользователей
        user_counts = interactions_df.groupby(user_col)[item_col].count()
        valid_users = user_counts[user_counts >= k_core].index
        
        #взаимодействия для айтемов
        item_counts = interactions_df.groupby(item_col)[user_col].count()
        valid_items = item_counts[item_counts >= k_core].index
        
        before_count = len(interactions_df)
        filtered_df = interactions_df[
            (interactions_df[user_col].isin(valid_users)) &
            (interactions_df[item_col].isin(valid_items))
        ]
        after_count = len(filtered_df)
        
        if before_count == after_count:
            break
        interactions_df = filtered_df

    print(f"Interactions left: {len(filtered_df)}")

    shuffled_df = filtered_df.sample(frac=1, random_state=seed)
    
    shuffled_df['user_interaction_rank'] = shuffled_df.groupby(user_col).cumcount()
    shuffled_df['user_interaction_count'] = shuffled_df.groupby(user_col)[user_col].transform('count')
    
    shuffled_df['is_test'] = shuffled_df['user_interaction_rank'] < (shuffled_df['user_interaction_count'] * test_size)
    
    train_val_df = shuffled_df[~shuffled_df['is_test']].copy()
    test_df = shuffled_df[shuffled_df['is_test']].copy()
    
    print(f"Train+val: {len(train_val_df)}")
    print(f"Test:{len(test_df)}")
    
    val_frac = val_size / (1 - test_size) 
    train_val_df['user_interaction_rank'] = train_val_df.groupby(user_col).cumcount()
    train_val_df['user_interaction_count'] = train_val_df.groupby(user_col)[user_col].transform('count')
    
    train_val_df['is_val'] = train_val_df['user_interaction_rank'] < (train_val_df['user_interaction_count'] * val_frac)
    
    train_df = train_val_df[~train_val_df['is_val']].copy()
    val_df = train_val_df[train_val_df['is_val']].copy()
    
    print(f"Train: {len(train_df)}")
    print(f"Val: {len(val_df)}")
    
    columns_to_drop = ['user_interaction_rank', 'user_interaction_count', 'is_test', 'is_val']
    train_df.drop(columns=columns_to_drop, inplace=True)
    val_df.drop(columns=columns_to_drop, inplace=True)
    test_df.drop(columns=columns_to_drop, errors ='ignore', inplace=True)

    print(f"total interactions: {len(train_df) + len(val_df) + len(test_df)}")
    print(f"unique users(recipes): train {train_df[user_col].nunique()}, val {val_df[user_col].nunique()}, test {test_df[user_col].nunique()} ")
    print(f"unique items: train {train_df[item_col].nunique()}, val {val_df[item_col].nunique()}, test {test_df[item_col].nunique()} ")

    return train_df, val_df, test_df

In [50]:
train, val, test = train_val_test_split(
    interactions_df, 
    k_core=5, 
    test_size=0.2,
    val_size=0.15   
)

print(train.shape)
print(val.shape)
print(test.shape)

Interactions left: 986970
Train+val: 738124
Test:248846
Train: 548383
Val: 189741
total interactions: 986970
unique users(recipes): train 123767, val 123767, test 123767 
unique items: train 881, val 864, test 867 
(548383, 4)
(189741, 4)
(248846, 4)


In [51]:
train.head()

Unnamed: 0,recipe_id,ingredient_id,user_id,item_id
1057780,https://www.povarenok.ru/recipes/show/53123/,Картофель,120835,43
115409,https://www.povarenok.ru/recipes/show/6201/,Крупа манная,13159,135
1198948,https://www.povarenok.ru/recipes/show/87701/,Паприка сладкая,136994,85
652094,https://www.povarenok.ru/recipes/show/20172/,Колбаски,74417,288
356868,https://www.povarenok.ru/recipes/show/96022/,Яйцо куриное,40768,4


#### for eval

In [52]:
train_grouped = train.groupby('user_id')['item_id'].apply(list).reset_index()
train_grouped.rename(columns={'item_id': 'train_interactions'}, inplace=True)

val_grouped = val.groupby('user_id')['item_id'].apply(list).reset_index()
val_grouped.rename(columns={'item_id': 'val_interactions'}, inplace=True)

test_grouped = test.groupby('user_id')['item_id'].apply(list).reset_index()
test_grouped.rename(columns={'item_id': 'test_interactions'}, inplace=True)

train_val_joined = pd.merge(train_grouped, val_grouped, on='user_id', how='outer')
full_grouped_data = pd.merge(train_val_joined, test_grouped, on='user_id', how='outer')

In [54]:
from math import log2

def hit_rate(recommendations, ground_truth, k=100):
    """
    Calculate HitRate@k

    Args:
        recommendations: dict {user_id: list of recommended item_ids}
        ground_truth: dict {user_id: set of relevant item_ids}
        k: cutoff level
    """
    hits = 0
    total_users = len(recommendations)

    for user_id, recs in recommendations.items():
        user_recs = recs[:k]
        user_truth = ground_truth.get(user_id, set())
        if any(item in user_truth for item in user_recs):
            hits += 1

    return hits / total_users if total_users > 0 else 0.0


def precision(recommendations, ground_truth, k=100):
    """
    Calculate Precision@k
    """
    precisions = []

    for user_id, recs in recommendations.items():
        user_recs = recs[:k]
        user_truth = ground_truth.get(user_id, set())
        relevant_count = sum(1 for item in user_recs if item in user_truth)
        user_precision = relevant_count / k
        precisions.append(user_precision)

    return np.mean(precisions) if precisions else 0.0


def recall(recommendations, ground_truth, k=100):
    """
    Calculate Recall@k
    """
    recalls = []

    for user_id, recs in recommendations.items():
        user_recs = recs[:k]
        user_truth = ground_truth.get(user_id, set())

        if not user_truth:  # If no ground truth items, recall is 0
            recalls.append(0.0)
            continue

        relevant_count = sum(1 for item in user_recs if item in user_truth)
        user_recall = relevant_count / len(user_truth)
        recalls.append(user_recall)

    return np.mean(recalls) if recalls else 0.0

def mrr(recommendations, ground_truth, k=100):
    """
    Calculate MRR@k
    """
    reciprocal_ranks = []

    for user_id, recs in recommendations.items():
        user_recs = recs[:k]
        user_truth = ground_truth.get(user_id, set())

        user_rr = 0.0
        for rank, item in enumerate(user_recs, 1):
            if item in user_truth:
                user_rr = 1.0 / rank
                break

        reciprocal_ranks.append(user_rr)

    return np.mean(reciprocal_ranks) if reciprocal_ranks else 0.0

def ndcg(recommendations, ground_truth, k=100, binary_relevance=True):
    """
    Calculate NDCG@k

    Args:
        binary_relevance: if True, uses binary relevance (0/1),
                         if False, expects relevance scores in ground_truth
    """
    ndcg_scores = []

    for user_id, recs in recommendations.items():
        user_recs = recs[:k]
        user_truth = ground_truth.get(user_id, {})

        # Calculate DCG
        dcg = 0.0
        for rank, item in enumerate(user_recs, 1):
            if binary_relevance:
                rel = 1.0 if item in user_truth else 0.0
            else:
                rel = user_truth.get(item, 0.0)

            dcg += rel / (log2(rank + 1) if rank == 1 else 1)

        # Calculate IDCG
        if binary_relevance:
            # For binary relevance, ideal is all 1's sorted first
            num_relevant = len(user_truth)
            ideal_gains = [1.0] * min(k, num_relevant)
        else:
            # For graded relevance, take top-k relevance scores
            ideal_gains = sorted(user_truth.values(), reverse=True)[:k]

        idcg = 0.0
        for rank, rel in enumerate(ideal_gains, 1):
            idcg += rel / (log2(rank + 1) if rank == 1 else 1)

        user_ndcg = dcg / idcg if idcg > 0 else 0.0
        ndcg_scores.append(user_ndcg)

    return np.mean(ndcg_scores) if ndcg_scores else 0.0
        

In [55]:
def evaluate_model(df: pd.DataFrame, preds_col: str, gt_col: str, top_k: int = 20
) -> dict:
    recommendations = pd.Series(df[preds_col].values,index=df['user_id']).to_dict()

    ground_truth = pd.Series(df[gt_col].apply(set).values, index=df['user_id']).to_dict()

    hr = hit_rate(recommendations, ground_truth, k=top_k)
    p = precision(recommendations, ground_truth, k=top_k)
    r = recall(recommendations, ground_truth, k=top_k)
    m = mrr(recommendations, ground_truth, k=top_k)
    n = ndcg(recommendations, ground_truth, k=top_k)
    
    results = {
        f'hit_rate@{top_k}': hr,
        f'precision@{top_k}': p,
        f'recall@{top_k}': r,
        f'mrr@{top_k}': m,
        f'ndcg@{top_k}': n
    }
    
    return results

### model 1: EASE

In [56]:
from scipy import sparse as sps

matrix_train = sps.coo_matrix(
    (np.ones(train.shape[0]), (train['user_id'], train['item_id'])),
    shape=(len(recipe2id), len(item2id)),
)
matrix_train

<COOrdinate sparse matrix of dtype 'float64'
	with 548383 stored elements and shape (146564, 1066)>

In [57]:
%%time

# Обучаем конечную модель
# Мы взяли реализацию из RecBole

def fit_ease(X, reg_weight=100):
    
    # gram matrix
    G = X.T @ X

    # add reg to diagonal
    G += reg_weight * sps.identity(G.shape[0])

    # convert to dense because inverse will be dense
    G = G.todense()

    # invert. this takes most of the time
    P = np.linalg.inv(G)
    B = P / (-np.diag(P))
    # zero out diag
    np.fill_diagonal(B, 0.)
    
    return B

w = fit_ease(matrix_train)

CPU times: user 149 ms, sys: 36 ms, total: 185 ms
Wall time: 217 ms


In [58]:
def get_preds(user_interactions, item2id, id2item, model_weights):
    encoded_ids = user_interactions
    
    vector = np.zeros(len(item2id))
    vector[encoded_ids] = 1
    
    preds = vector @ model_weights
    preds[encoded_ids] = -np.inf  # Filter out items already seen
    
    top_indices = np.argsort(-preds)[:20]
    
    decoded = [id2item[i] for i in top_indices]
    
    return  top_indices

In [59]:
w = np.asarray(w)

tqdm.pandas()
full_grouped_data['ease_preds'] = full_grouped_data['train_interactions'].progress_apply(
    lambda interactions: get_preds(interactions, item2id, id2item, w)
)
full_grouped_data.head()

100%|██████████| 123767/123767 [00:41<00:00, 2973.80it/s]


Unnamed: 0,user_id,train_interactions,val_interactions,test_interactions,ease_preds
0,1,"[6, 3, 5, 7]",[4],"[2, 8]","[55, 18, 8, 51, 58, 150, 11, 81, 67, 127, 71, ..."
1,2,"[11, 8, 10]",[3],[9],"[2, 4, 3, 39, 48, 18, 43, 16, 58, 50, 9, 186, ..."
2,3,"[0, 2, 16, 18, 13, 10]","[12, 17]","[14, 15]","[4, 3, 15, 156, 39, 11, 50, 61, 43, 8, 186, 24..."
3,4,"[19, 20, 22]",[23],[21],"[4, 70, 24, 15, 69, 0, 164, 29, 68, 35, 23, 92..."
4,5,"[24, 25, 4, 29, 15]","[28, 22]","[26, 27]","[68, 0, 18, 35, 10, 156, 22, 61, 69, 64, 92, 7..."


In [60]:
evaluate_model(full_grouped_data, 'ease_preds', 'test_interactions', top_k=6)

{'hit_rate@6': 0.59069865149838,
 'precision@6': np.float64(0.12156175178628659),
 'recall@6': np.float64(0.3669534959507246),
 'mrr@6': np.float64(0.3605962547905877),
 'ndcg@6': np.float64(0.3669534959507246)}

In [61]:
evaluate_model(full_grouped_data, 'ease_preds', 'test_interactions', top_k=10)

{'hit_rate@10': 0.696154871653995,
 'precision@10': np.float64(0.092590108833534),
 'recall@10': np.float64(0.4643900770533879),
 'mrr@10': np.float64(0.37347038489245216),
 'ndcg@10': np.float64(0.4643900770533879)}

In [62]:
evaluate_model(full_grouped_data, 'ease_preds', 'test_interactions', top_k=20)

{'hit_rate@20': 0.8174311407725807,
 'precision@20': np.float64(0.06028787964481648),
 'recall@20': np.float64(0.6027340351897786),
 'mrr@20': np.float64(0.3819945605717543),
 'ndcg@20': np.float64(0.6027340351897786)}

In [None]:
#навайбкодила как могла 
def inspect_recommendations(
    user_id: int, 
    df: pd.DataFrame, 
    id2recipe: dict, 
    id2item: dict,
    preds: str
):

    user_data = df[df['user_id'] == user_id] 
    user_data = user_data.iloc[0]
    recipe_name = id2recipe.get(user_id, "Unknown Recipe")
    train_items = [id2item.get(i, "Unknown") for i in user_data['train_interactions']]
    test_items = [id2item.get(i, "Unknown") for i in user_data['test_interactions']]
    pred_items = [id2item.get(i, "Unknown") for i in user_data[preds]]
    
    
    print("="*80)
    print(f"RECIPE: {recipe_name} (User ID: {user_id})")
    print("="*80)
    
    print(f"\n--- Ingredients in Training Set ({len(train_items)}) ---")
    print(", ".join(train_items))
    
    print(f"\n--- Ground Truth Ingredients in Test Set ({len(test_items)}) ---")
    print(", ".join(test_items))
    
    print(f"\n--- Top Recommended Ingredients ({len(pred_items)}) ---")
    successful_preds = set(test_items).intersection(set(pred_items))
    
    display_preds = []
    for item in pred_items:
        if item in successful_preds:
            display_preds.append(f"✅ {item}") # Add a checkmark for hits
        else:
            display_preds.append(item)
            
    print(", ".join(display_preds))
    print("\n" + "="*80)

inspect_recommendations(user_id=1, df=full_grouped_data, id2recipe=id2recipe, id2item=id2item, preds='ease_preds')
inspect_recommendations(user_id=146561, df=full_grouped_data, id2recipe=id2recipe, id2item=id2item, preds='ease_preds')

RECIPE: https://www.povarenok.ru/recipes/show/1306/ (User ID: 1)

--- Ingredients in Training Set (4) ---
Лук зеленый, Чеснок, Грейпфрут, Листья салата

--- Ground Truth Ingredients in Test Set (2) ---
Сыр твердый, Майонез

--- Top Recommended Ingredients (20) ---
Масло оливковое, Масло растительное, ✅ Майонез, Соевый соус, Огурец, Укроп, Помидор, Уксус, Горчица, Петрушка, Сок лимонный, Яйцо куриное, ✅ Сыр твердый, Мед, Перец болгарский, Помидоры черри, Филе куриное, Лимон, Сметана, Зелень

RECIPE: https://www.povarenok.ru/recipes/show/80590/ (User ID: 146561)

--- Ingredients in Training Set (4) ---
Хрен, Петрушка, Лук зеленый, Сухари панировочные

--- Ground Truth Ingredients in Test Set (2) ---
Лук репчатый, Грибы

--- Top Recommended Ingredients (20) ---
Яйцо куриное, Укроп, Чеснок, Масло растительное, Огурец, Картофель, Сметана, Горчица, Сыр твердый, Сок лимонный, Фарш мясной, Масло оливковое, Молоко, Перец красный жгучий, Уксус, Хлеб, Свекла, Помидор, Масло сливочное, Филе курино

### hyperparameters tuning

In [75]:
import optuna

def objective(trial):
    regul = trial.suggest_int('regul', 50, 2000)
    w = np.asarray(fit_ease(matrix_train, regul))
    tqdm.pandas()
    full_grouped_data['ease_preds'] = full_grouped_data['train_interactions'].progress_apply(
        lambda interactions: get_preds(interactions, item2id, id2item, w)
    )
    return evaluate_model(full_grouped_data, 'ease_preds', 'val_interactions', top_k=6)['hit_rate@6']

study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=15)

  from .autonotebook import tqdm as notebook_tqdm
[I 2025-11-14 01:05:21,150] A new study created in memory with name: no-name-a411cda7-7546-417e-8a78-efa7e0c0aa9b
100%|██████████| 123767/123767 [00:27<00:00, 4481.80it/s]
[I 2025-11-14 01:05:58,074] Trial 0 finished with value: 0.48049156883498834 and parameters: {'regul': 656}. Best is trial 0 with value: 0.48049156883498834.
100%|██████████| 123767/123767 [00:39<00:00, 3118.48it/s]
[I 2025-11-14 01:06:51,279] Trial 1 finished with value: 0.486680617612126 and parameters: {'regul': 179}. Best is trial 1 with value: 0.486680617612126.
100%|██████████| 123767/123767 [00:26<00:00, 4738.45it/s]
[I 2025-11-14 01:07:26,042] Trial 2 finished with value: 0.4848788449263535 and parameters: {'regul': 294}. Best is trial 1 with value: 0.486680617612126.
100%|██████████| 123767/123767 [00:25<00:00, 4779.32it/s]
[I 2025-11-14 01:07:58,675] Trial 3 finished with value: 0.4726381022405003 and parameters: {'regul': 1666}. Best is trial 1 with value: 

In [76]:
study.best_trial.params['regul']

72

results on train + val (optimal parameters)

In [77]:
train_val_df = pd.concat([train, val])

num_users = len(recipe2id)
num_items = len(item2id)

interactions_matrix_train_val = sps.coo_matrix(
    (np.ones(train_val_df.shape[0]), (train_val_df['user_id'], train_val_df['item_id'])),
    shape=(num_users, num_items),
).tocsr()

In [None]:
w_final =fit_ease(interactions_matrix_train_val, reg_weight=study.best_trial.params['regul'])
w_final_array= np.asarray(w_final) 

train_val_interactions = full_grouped_data['train_interactions']

tqdm.pandas()
full_grouped_data['ease_full_preds'] = train_val_interactions.progress_apply(
    lambda interactions: get_preds(interactions, item2id, id2item, w_final_array)
)

100%|██████████| 123767/123767 [00:58<00:00, 2099.46it/s]


In [80]:
evaluate_model(df=full_grouped_data,preds_col='ease_full_preds',gt_col='test_interactions',top_k=6)

{'hit_rate@6': 0.5965968311423885,
 'precision@6': np.float64(0.12295011325043563),
 'recall@6': np.float64(0.3710089657717054),
 'mrr@6': np.float64(0.3642634951158225),
 'ndcg@6': np.float64(0.3710089657717054)}

In [81]:
evaluate_model(full_grouped_data, 'ease_preds', 'test_interactions', top_k=6)

{'hit_rate@6': 0.5845984794008096,
 'precision@6': np.float64(0.11997543771764686),
 'recall@6': np.float64(0.36229905117411476),
 'mrr@6': np.float64(0.3552171957522333),
 'ndcg@6': np.float64(0.36229905117411476)}

## slim

In [82]:
import warnings
warnings.filterwarnings('ignore')


In [83]:
from sklearn.linear_model import ElasticNet

def train_slim(
    train_matrix: sps.csr_matrix, 
    l1_reg: float = 0.001, 
    l2_reg: float = 0.0001
) -> sps.csr_matrix:
    num_items = train_matrix.shape[1]
    
    train_matrix_csc = train_matrix.tocsc()
    rows, cols, data = [], [], []

    model = ElasticNet(
        alpha=l1_reg + l2_reg,
        l1_ratio=l1_reg / (l1_reg + l2_reg) if (l1_reg + l2_reg) > 0 else 0,
        positive=True,
        fit_intercept=False,
        copy_X=False,   
        precompute=True, 
        max_iter=300, 
        tol=1e-4 
    )

    for j in tqdm(range(num_items)):
        y = train_matrix_csc[:, j].toarray().ravel()
        
        # (w_jj = 0) j-th column of the training data to zero
        start_pos = train_matrix_csc.indptr[j]
        end_pos = train_matrix_csc.indptr[j + 1]
        
        original_values = train_matrix_csc.data[start_pos:end_pos].copy()
        train_matrix_csc.data[start_pos:end_pos] = 0.0

        model.fit(train_matrix_csc, y)
        coeffs = model.coef_
        non_zero_indices = coeffs.nonzero()[0]
        if len(non_zero_indices) > 0:
            rows.extend(non_zero_indices)
            cols.extend([j] * len(non_zero_indices))
            data.extend(coeffs[non_zero_indices])
            
        train_matrix_csc.data[start_pos:end_pos] = original_values


    W_slim = sps.csr_matrix((data, (rows, cols)), shape=(num_items, num_items))
    
    return W_slim

In [94]:
w_slim = train_slim(matrix_train, l1_reg=0.001, l2_reg=0.001)

100%|██████████| 1066/1066 [03:34<00:00,  4.97it/s]


In [85]:
def get_slim_predictions(user_interactions, model_weights, top_k=20):
    vector = np.zeros(model_weights.shape[1])
    vector[user_interactions] = 1
    
    scores = (sps.csr_matrix(vector) @ model_weights).toarray().flatten()
    
    scores[user_interactions] = -np.inf
    
    top_indices = np.argsort(-scores)[:top_k].tolist()
    
    return top_indices

In [95]:
tqdm.pandas()
full_grouped_data['slim_preds'] = full_grouped_data['train_interactions'].progress_apply(
    lambda interactions: get_slim_predictions(interactions, w_slim)
)
full_grouped_data.head()

  0%|          | 0/123767 [00:00<?, ?it/s]

100%|██████████| 123767/123767 [00:16<00:00, 7550.67it/s]


Unnamed: 0,user_id,train_interactions,val_interactions,test_interactions,ease_preds,ease_full_preds,slim_preds
0,1,"[6, 3, 5, 7]",[4],"[2, 8]","[55, 18, 8, 58, 51, 150, 11, 4, 81, 127, 71, 2...","[55, 18, 8, 58, 51, 81, 11, 150, 67, 164, 127,...","[55, 18, 10, 150, 4, 58, 51, 11, 127, 8, 39, 1..."
1,2,"[11, 8, 10]",[3],[9],"[4, 3, 2, 39, 18, 48, 43, 16, 58, 50, 9, 73, 1...","[2, 4, 3, 39, 48, 43, 16, 18, 58, 9, 73, 186, ...","[3, 4, 2, 39, 18, 48, 43, 16, 58, 50, 55, 127,..."
2,3,"[0, 2, 16, 18, 13, 10]","[12, 17]","[14, 15]","[4, 3, 15, 39, 11, 156, 43, 8, 50, 61, 24, 186...","[4, 156, 15, 3, 11, 50, 39, 45, 186, 73, 61, 4...","[4, 3, 39, 15, 11, 43, 8, 24, 50, 156, 61, 48,..."
3,4,"[19, 20, 22]",[23],[21],"[4, 70, 15, 24, 68, 0, 164, 29, 69, 23, 76, 35...","[24, 70, 15, 4, 164, 69, 29, 23, 0, 18, 76, 64...","[70, 15, 4, 164, 76, 68, 24, 155, 23, 0, 27, 2..."
4,5,"[24, 25, 4, 29, 15]","[28, 22]","[26, 27]","[68, 0, 18, 35, 10, 156, 61, 22, 69, 64, 92, 1...","[68, 0, 18, 35, 156, 22, 10, 61, 69, 64, 92, 1...","[0, 68, 18, 35, 61, 69, 156, 10, 92, 64, 22, 1..."


In [96]:
evaluate_model(df=full_grouped_data,preds_col='slim_preds',gt_col='test_interactions',top_k=6)

{'hit_rate@6': 0.5339549314437613,
 'precision@6': np.float64(0.10706542670232506),
 'recall@6': np.float64(0.32323977042884344),
 'mrr@6': np.float64(0.32394539739995315),
 'ndcg@6': np.float64(0.32323977042884344)}

In [88]:
import optuna

def objective(trial):
    l1_reg = trial.suggest_float('l1_reg', 1e-4, 1e-1)
    l2_reg = trial.suggest_float('l2_reg', 1e-4, 1e-1)
    w = train_slim(interactions_matrix, l1_reg=l1_reg, l2_reg=l2_reg)
    tqdm.pandas()
    full_grouped_data['slim_preds'] = full_grouped_data['train_interactions'].progress_apply(
    lambda interactions: get_slim_predictions(interactions, w)
    )
    return evaluate_model(full_grouped_data, 'slim_preds', 'val_interactions', top_k=6)['hit_rate@6']

study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=15)

[I 2025-11-14 01:21:57,317] A new study created in memory with name: no-name-342acc70-5994-482e-8576-43f056c5b7e1
100%|██████████| 1066/1066 [00:12<00:00, 86.45it/s]
100%|██████████| 123767/123767 [00:16<00:00, 7456.84it/s]
[I 2025-11-14 01:22:27,333] Trial 0 finished with value: 0.18739243901847827 and parameters: {'l1_reg': 0.07989988094225131, 'l2_reg': 0.09963262840332024}. Best is trial 0 with value: 0.18739243901847827.
100%|██████████| 1066/1066 [00:14<00:00, 71.17it/s]
100%|██████████| 123767/123767 [00:15<00:00, 8045.38it/s]
[I 2025-11-14 01:22:58,537] Trial 1 finished with value: 0.41206460526634725 and parameters: {'l1_reg': 0.006139654412824998, 'l2_reg': 0.09277566889523463}. Best is trial 1 with value: 0.41206460526634725.
100%|██████████| 1066/1066 [00:13<00:00, 79.79it/s]
100%|██████████| 123767/123767 [00:51<00:00, 2409.98it/s]
[I 2025-11-14 01:24:07,417] Trial 2 finished with value: 0.18739243901847827 and parameters: {'l1_reg': 0.08061538167750575, 'l2_reg': 0.032065

In [89]:
study.best_trial.params['l1_reg'], study.best_trial.params['l2_reg']

(0.0011867571092305006, 0.09935674383304606)

In [97]:
w_slim = train_slim(interactions_matrix_train_val, l1_reg=study.best_trial.params['l1_reg'],
l2_reg=study.best_trial.params['l2_reg'])

  0%|          | 0/1066 [00:00<?, ?it/s]

100%|██████████| 1066/1066 [05:12<00:00,  3.41it/s]


In [98]:
tqdm.pandas()
full_grouped_data['slim_full_preds'] = full_grouped_data['train_interactions'].progress_apply(
    lambda interactions: get_slim_predictions(interactions, w_slim)
)
full_grouped_data.head()

100%|██████████| 123767/123767 [00:22<00:00, 5602.95it/s]


Unnamed: 0,user_id,train_interactions,val_interactions,test_interactions,ease_preds,ease_full_preds,slim_preds,slim_full_preds
0,1,"[6, 3, 5, 7]",[4],"[2, 8]","[55, 18, 8, 58, 51, 150, 11, 4, 81, 127, 71, 2...","[55, 18, 8, 58, 51, 81, 11, 150, 67, 164, 127,...","[55, 18, 10, 150, 4, 58, 51, 11, 127, 8, 39, 1...","[10, 18, 55, 11, 51, 8, 39, 150, 4, 127, 81, 4..."
1,2,"[11, 8, 10]",[3],[9],"[4, 3, 2, 39, 18, 48, 43, 16, 58, 50, 9, 73, 1...","[2, 4, 3, 39, 48, 43, 16, 18, 58, 9, 73, 186, ...","[3, 4, 2, 39, 18, 48, 43, 16, 58, 50, 55, 127,...","[3, 39, 4, 18, 2, 43, 16, 48, 50, 58, 55, 24, ..."
2,3,"[0, 2, 16, 18, 13, 10]","[12, 17]","[14, 15]","[4, 3, 15, 39, 11, 156, 43, 8, 50, 61, 24, 186...","[4, 156, 15, 3, 11, 50, 39, 45, 186, 73, 61, 4...","[4, 3, 39, 15, 11, 43, 8, 24, 50, 156, 61, 48,...","[4, 3, 39, 15, 11, 43, 24, 8, 50, 156, 61, 48,..."
3,4,"[19, 20, 22]",[23],[21],"[4, 70, 15, 24, 68, 0, 164, 29, 69, 23, 76, 35...","[24, 70, 15, 4, 164, 69, 29, 23, 0, 18, 76, 64...","[70, 15, 4, 164, 76, 68, 24, 155, 23, 0, 27, 2...","[15, 4, 70, 164, 68, 76, 24, 23, 155, 0, 29, 2..."
4,5,"[24, 25, 4, 29, 15]","[28, 22]","[26, 27]","[68, 0, 18, 35, 10, 156, 61, 22, 69, 64, 92, 1...","[68, 0, 18, 35, 156, 22, 10, 61, 69, 64, 92, 1...","[0, 68, 18, 35, 61, 69, 156, 10, 92, 64, 22, 1...","[0, 68, 18, 35, 61, 10, 156, 69, 92, 64, 147, ..."


In [99]:
evaluate_model(df=full_grouped_data,preds_col='slim_full_preds',gt_col='test_interactions',top_k=6)

{'hit_rate@6': 0.5310866386031818,
 'precision@6': np.float64(0.10623725764272113),
 'recall@6': np.float64(0.32126172566192923),
 'mrr@6': np.float64(0.3157123196544043),
 'ndcg@6': np.float64(0.32126172566192923)}