In [33]:
import pandas as pd
import numpy as np
import mlflow
from sklearn.preprocessing import LabelEncoder

In [15]:
!pip install implicit



In [27]:
history = pd.read_csv("User Listening History.csv")

0.7.2


In [17]:
from implicit.als import AlternatingLeastSquares
from scipy.sparse import csr_matrix

In [65]:
from implicit.evaluation import ndcg_at_k, mean_average_precision_at_k

In [79]:
mlflow.set_experiment("ials")

<Experiment: artifact_location='mlflow-artifacts:/117777105458341967', creation_time=1739880424523, experiment_id='117777105458341967', last_update_time=1739880424523, lifecycle_stage='active', name='ials', tags={}>

In [None]:
import os

os.environ["OPENBLAS_NUM_THREADS"] = "1"

In [143]:
def split_data(history, test_size=0.2, random_state=42):
    # Создаем копию, чтобы не менять исходный DataFrame
    history = history.copy()
    np.random.seed(random_state)
    # Генерируем случайное число для каждой строки
    history["rand"] = np.random.rand(len(history))

    # Строки с rand < test_size попадут в тестовую выборку
    test_samples = history[history["rand"] < test_size].drop(columns=["rand"])
    train_data = history[history["rand"] >= test_size].drop(columns=["rand"])

    # Формируем тестовые данные как словарь {user_id: [track_id]}
    test_data = test_samples.groupby("user_id")["track_id"].apply(list).to_dict()
    return train_data, test_data

In [None]:
def prepare_data(history_path, minfo_path):
    history = pd.read_csv(history_path, dtype={"user_id": "str", "track_id": "str"})
    minfo = pd.read_csv(minfo_path, dtype={"track_id": "str"})

    history = history[
        (history["playcount"] > 0)
        & (history["user_id"].str.len() > 5)
        & (history["track_id"].isin(minfo["track_id"]))
    ].copy()
    history = history.groupby(["user_id", "track_id"])["playcount"].sum().reset_index()
    if len(history) == 0:
        raise ValueError("Нет данных после очистки")

    user_encoder = LabelEncoder()
    track_encoder = LabelEncoder()
    history["user_encoded"] = user_encoder.fit_transform(history["user_id"])
    history["track_encoded"] = track_encoder.fit_transform(history["track_id"])

    row = history["user_encoded"].values
    col = history["track_encoded"].values
    data = 1 + 0.5 * np.log1p(history["playcount"].values)

    sparse_matrix = csr_matrix(
        (data, (row, col)),
        shape=(len(user_encoder.classes_), len(track_encoder.classes_)),
    )

    # фильтрация неактивных пользователей
    user_activity = np.diff(sparse_matrix.indptr)
    active_users_mask = user_activity > 0
    sparse_matrix = sparse_matrix[active_users_mask]

    active_user_ids = user_encoder.classes_[active_users_mask]
    user_encoder = LabelEncoder()
    user_encoder.fit(active_user_ids)

    return sparse_matrix, user_encoder, track_encoder, minfo

In [None]:
def train_model(sparse_matrix):
    user_item_matrix = sparse_matrix.tocsr()

    model = AlternatingLeastSquares(
        factors=64, regularization=0.5, iterations=50, random_state=42, dtype=np.float32
    )

    alpha = 10  # слишком много треков с низким числом прослушиваний, и когда задаешь большую alpha, модель начинает их переоценивать
    model.fit(user_item_matrix * alpha)

    return model, user_item_matrix

In [None]:
def get_recommendations(
    user_id,
    model,
    user_encoder,
    track_encoder,
    minfo,
    user_item_matrix,
    history_df,
    top_k=10,
):
    try:
        if user_id not in user_encoder.classes_:
            return pd.DataFrame(columns=["artist", "name"])

        internal_id = user_encoder.transform([user_id])[0]

        if internal_id >= user_item_matrix.shape[0]:
            return pd.DataFrame(columns=["artist", "name"])

        # Получаем рекомендации
        item_ids, scores = model.recommend(
            internal_id,
            user_item_matrix[internal_id],
            N=top_k * 2,  # Берем с запасом
            filter_already_liked_items=False,
        )

        # прослушанные треки пользователя, чтобы рекомендавались не они
        listened_tracks = history_df[history_df["user_id"] == user_id][
            "track_id"
        ].unique()

        track_ids = track_encoder.inverse_transform(item_ids)
        recommendations = minfo[minfo["track_id"].isin(track_ids)]

        recommendations = recommendations[
            ~recommendations["track_id"].isin(listened_tracks)
        ]

        return recommendations.head(top_k)

    except Exception as e:
        print(f"Ошибка: {str(e)}")
        return pd.DataFrame(columns=["artist", "name"])

In [159]:
train, test = split_data(history, test_size=0.2)
sparse_matrix, user_encoder, track_encoder, minfo = prepare_data(
    "User Listening History.csv", "Music Info.csv"
)

mlflow.set_tracking_uri("http://127.0.0.1:5000")
with mlflow.start_run():

    # логируем параметры модели
    mlflow.log_param("factors", 64)
    mlflow.log_param("regularization", 0.5)
    mlflow.log_param("iterations", 50)
    mlflow.log_param("alpha", 10)
    mlflow.log_param("k", 20)

    model, user_item_matrix = train_model(sparse_matrix)

    user_id = "b80344d063b5ccb3212f76538f3d9e43d87dca9e"

    if user_id in user_encoder.classes_:
        recs = get_recommendations(
            user_id,
            model,
            user_encoder,
            track_encoder,
            minfo,
            user_item_matrix,
            history,
        )
        print(f"\nТоп-10 рекомендаций для {user_id}:")
        for i, (artist, name) in enumerate(zip(recs["artist"], recs["name"]), 1):
            print(f"{i}. {artist} - {name}")
    else:
        print("Пользователь не найден в системе")

    # подготовка данных к логам метрик

    user_encoder = dict(zip(user_encoder.classes_, range(len(user_encoder.classes_))))
    track_encoder = dict(
        zip(track_encoder.classes_, range(len(track_encoder.classes_)))
    )
    train["user_encoded"] = train["user_id"].map(user_encoder)
    train["track_encoded"] = train["track_id"].map(track_encoder)

    train_csr = csr_matrix(
        (
            train["playcount"].values,
            (train["user_encoded"].values, train["track_encoded"].values),
        )
    )
    user_idxn, track_idxn = [], []

    for user, tracks in test.items():
        if user in user_encoder:  # Проверяем, есть ли пользователь в train
            for track in tracks:
                if track in track_encoder:  # Проверяем, есть ли трек в train
                    user_idxn.append(user_encoder[user])
                    track_idxn.append(track_encoder[track])

    user_idxn = np.array(user_idxn)
    track_idxn = np.array(track_idxn)

    test_csr = csr_matrix(
        (np.ones_like(user_idxn), (user_idxn, track_idxn)),
        shape=(len(user_encoder), len(track_encoder)),
    )

    k = 10
    ndcg_at_10 = ndcg_at_k(model, train_csr, test_csr, 10)
    map_at_10 = mean_average_precision_at_k(model, train_csr, test_csr, 10)
    print(f"NDCG_{k}: {ndcg_at_10:.4f}, MAP_{k}: {map_at_10:.4f}")

    # лог метрик
    mlflow.log_metric("NDCG_10", ndcg_at_10)
    mlflow.log_metric("MAP_10", map_at_10)

    mlflow.sklearn.log_model(model, "ials")

  0%|          | 0/50 [00:00<?, ?it/s]


Топ-10 рекомендаций для b80344d063b5ccb3212f76538f3d9e43d87dca9e:
1. Kings of Leon - Revelry
2. Fleet Foxes - Tiger Mountain Peasant Song
3. Fleet Foxes - Your Protector
4. Fleet Foxes - Sun It Rises
5. Jack Johnson - Staple It Together
6. Fleet Foxes - Oliver James
7. John Mayer - Heartbreak Warfare
8. Zero 7 - In the Waiting Line
9. Fleet Foxes - White Winter Hymnal
10. Jack Johnson - The News


  0%|          | 0/637276 [00:00<?, ?it/s]

  0%|          | 0/637276 [00:00<?, ?it/s]

NDCG_10: 0.3083, MAP_10: 0.2364




🏃 View run shivering-boar-491 at: http://127.0.0.1:5000/#/experiments/117777105458341967/runs/6e266e528fe84a85b90b085e376a8856
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/117777105458341967
